diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -38,6 +38,7 @@ X86PreAMXConfig.cpp X86LowerAMXIntrinsics.cpp X86TileConfig.cpp + X86FastPreTileConfig.cpp X86FastTileConfig.cpp X86PreTileConfig.cpp X86ExpandPseudo.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -79,6 +79,9 @@ /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); +/// Return a pass that preconfigures the tile registers before fast reg allocation. +FunctionPass *createX86FastPreTileConfigPass(); + /// Return a pass that config the tile registers after fast reg allocation. FunctionPass *createX86FastTileConfigPass(); @@ -175,6 +178,7 @@ void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86FastPreTileConfigPass(PassRegistry &); void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86TileConfigPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -0,0 +1,493 @@ +//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to preconfigure the shape of physical tile registers. +/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm +/// walks each instruction of a basic block in reverse order. All the tile +/// registers that live out of the basic block are spilled and reloaded +/// before their users. It also checks the dependency of the shape to ensure +/// the shape is defined before ldtilecfg.
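+///
+/// A rough illustration of the intended transformation (the virtual register
+/// names and stack slots below are made up for this sketch; they are not
+/// produced literally by the pass): a tile def whose value is live out of its
+/// block
+///
+///   bb.0:
+///     %row = MOV16ri 16
+///     %col = MOV16ri 64
+///     %t = PTILEZEROV %row, %col                  ; %t is live out of bb.0
+///   bb.1:
+///     ... use of %t ...
+///
+/// becomes roughly
+///
+///   bb.0:
+///     %row = MOV16ri 16
+///     %col = MOV16ri 64
+///     LDTILECFG %stack.cfg                        ; after the last shape def
+///     %t = PTILEZEROV %row, %col
+///     TILESTORED %stack.0, ..., %t                ; spill the live-out tile
+///   bb.1:
+///     LDTILECFG %stack.cfg
+///     %t1 = PTILELOADDV %row, %col, %stack.0, ... ; reload before the user
+///     ... use of %t1 ...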
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "fastpretileconfig" + +STATISTIC(NumStores, "Number of stores added"); +STATISTIC(NumLoads, "Number of loads added"); + +namespace { + +class X86FastPreTileConfig : public MachineFunctionPass { + // context + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + X86MachineFunctionInfo *X86FI = nullptr; + MachineFrameInfo *MFI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineBasicBlock *MBB = nullptr; + int CfgSS = -1; + + /// Maps virtual regs to the frame index where these values are spilled. + IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg; + + int getStackSpaceFor(Register VirtReg); + void InitializeTileConfigStackSpace(); + void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill); + void reload(MachineBasicBlock::iterator UseMI, Register VirtReg, + MachineOperand *RowMO, MachineOperand *ColMO); + void convertPHI(MachineBasicBlock *MBB, MachineInstr &PHI); + void convertPHIs(MachineBasicBlock &MBB); + bool configBasicBlock(MachineBasicBlock &MBB); + +public: + X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + + /// Return the pass name. + StringRef getPassName() const override { + return "Fast Tile Register Preconfigure"; + } + + /// Perform tile register configuration. + bool runOnMachineFunction(MachineFunction &MFunc) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86FastPreTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) +INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) + +/// This allocates space for the specified virtual register to be held on the +/// stack. +int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) { + // Find the location Reg would belong... + int SS = StackSlotForVirtReg[VirtReg]; + // Already has space allocated? + if (SS != -1) + return SS; + + // Allocate a new stack object for this spill location... + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + unsigned Size = TRI->getSpillSize(RC); + Align Alignment = TRI->getSpillAlign(RC); + int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment); + + // Assign the slot.
+ StackSlotForVirtReg[VirtReg] = FrameIdx; + return FrameIdx; +} + +void X86FastPreTileConfig::InitializeTileConfigStackSpace() { + MachineBasicBlock &MBB = MF->front(); + MachineInstr *MI = &*MBB.getFirstNonPHI(); + DebugLoc DL; + if (ST->hasAVX512()) { + Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS) + .addReg(Zmm); + } else if (ST->hasAVX2()) { + Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS) + .addReg(Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS, + 32) + .addReg(Ymm); + } else { + assert(ST->hasSSE2() && "AMX should assume SSE2 enabled"); + unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48) + .addReg(Xmm); + } + // Fill in the palette first. + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS) + .addImm(1); +} + +static bool dominates(MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator A, + MachineBasicBlock::const_iterator B) { + auto MBBEnd = MBB.end(); + if (B == MBBEnd) + return true; + + MachineBasicBlock::const_iterator I = MBB.begin(); + for (; &*I != A && &*I != B; ++I) + ; + + return &*I == A; +} + +/// Insert spill instruction for \p VirtReg before \p Before. +/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot. +void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, + Register VirtReg, bool Kill) { + LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n"); + int FI = getStackSpaceFor(VirtReg); + LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); + + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + // Don't need shape information for tile store, because it is adjacent to + // the tile def instruction. + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI); + ++NumStores; + + // TODO: update DBG_VALUEs +} + +/// Insert reload instruction for \p OrigReg before \p UseMI. +void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, + Register OrigReg, MachineOperand *RowMO, + MachineOperand *ColMO) { + int FI = getStackSpaceFor(OrigReg); + const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg); + Register TileReg; + // Fold copy to tileload + // BB1: + // spill src to s + // + // BB2: + // t = copy src + // --> + // t = tileload (s) + if (UseMI->isCopy()) + TileReg = UseMI->getOperand(0).getReg(); + else + TileReg = MRI->createVirtualRegister(&RC); + // Can't use TII->loadRegFromStackSlot(), because we need the shape + // information for reload. + // tileloadd (%sp, %idx), %tmm + unsigned Opc = X86::PTILELOADDV; + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + // FIXME: MBB is not the parent of UseMI.
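+  // The reload sequence built below is roughly (sketch only; the virtual
+  // register names are illustrative):
+  //   %stride:gr64_nosp = MOV64ri 64
+  //   %t:tile = PTILELOADDV %row, %col, %stack.FI, 1, %stride, 0, $noreg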
+ MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), + TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + NewMI = addFrameReference( + BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg) + .addReg(RowMO->getReg()) + .addReg(ColMO->getReg()), + FI); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + RowMO->setIsKill(false); + ColMO->setIsKill(false); + // Erase copy instruction after it is folded. + if (UseMI->isCopy()) { + UseMI->eraseFromParent(); + } else { + // Replace the register in the user MI. + for (auto &MO : UseMI->operands()) { + if (MO.isReg() && MO.getReg() == OrigReg) + MO.setReg(TileReg); + } + } + + ++NumLoads; + LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into " + << printReg(TileReg, TRI) << '\n'); +} + +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // The instruction must have 3 operands: tile def, row, col. + if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) + return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. + if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } + + return false; +} + +static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { + MachineInstr *MI = MRI->getVRegDef(TileReg); + if (isTileDef(MRI, *MI)) { + MachineOperand *RowMO = &MI->getOperand(1); + MachineOperand *ColMO = &MI->getOperand(2); + return ShapeT(RowMO, ColMO, MRI); + } else if (MI->isCopy()) { + TileReg = MI->getOperand(1).getReg(); + return getShape(MRI, TileReg); + } + + // The def should not be PHI node, because we walk the MBB in reverse post + // order. + assert(MI->isPHI() && "Unexpected PHI when get shape."); + llvm_unreachable("Unexpected MI when get shape."); +} + +// BB0: +// spill t0 to s0 +// BB1: +// spill t1 to s1 +// +// BB2: +// t = phi [t0, bb0] [t1, bb1] +// --> +// row = phi [r0, bb0] [r1, bb1] +// col = phi [c0, bb0] [c1, bb1] +// s = phi [s0, bb0] [s1, bb1] +// t = tileload row, col, s +// The new instruction is inserted at the end of the phi node. The order +// of the original phi node is not ensured. +void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB, + MachineInstr &PHI) { + // 1. Create instruction to get stack slot address of each incoming block. + // 2. Create PHI node for the stack address. + // 3. Create PHI node for shape. If one of the incoming shape is immediate + // use the immediate and delete the PHI node. + // 4. Create tileload instruction from the stack address. 
+ Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), StackAddrReg); + Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), RowReg); + Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), ColReg); + for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) { + Register InTileReg = PHI.getOperand(I).getReg(); + MachineBasicBlock::iterator InsertPos; + MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg); + if (TileDefMI->isPHI()) { + convertPHI(TileDefMI->getParent(), *TileDefMI); + TileDefMI = MRI->getVRegDef(InTileReg); + InsertPos = TileDefMI->getParent()->getFirstNonPHI(); + } else { + InsertPos = TileDefMI->getIterator(); + } + MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB(); + int FI = getStackSpaceFor(InTileReg); + Register InStackAddrReg = + MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + addOffset(BuildMI(*TileDefMI->getParent(), InsertPos, DebugLoc(), + TII->get(X86::LEA64r), InStackAddrReg) + .addFrameIndex(FI), + 0); + AddrPHI.addReg(InStackAddrReg).addMBB(InMBB); + ShapeT Shape = getShape(MRI, InTileReg); + Shape.getRow()->setIsKill(false); + Shape.getCol()->setIsKill(false); + RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB); + ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB); + } + + MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI(); + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + Register TileReg = PHI.getOperand(0).getReg(); + MachineInstr *NewMI = addDirectMem( + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg) + .addReg(RowReg) + .addReg(ColReg), + StackAddrReg); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + PHI.eraseFromParent(); +} + +static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + MachineOperand &MO = MI.getOperand(0); + if (MO.isReg() && MO.getReg().isVirtual() && + MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + return true; + return false; +} + +void X86FastPreTileConfig::convertPHIs(MachineBasicBlock &MBB) { + MachineInstr *BeginMI = &*MBB.getFirstNonPHI(); + MachineBasicBlock::reverse_iterator Begin = BeginMI->getReverseIterator(); + MachineBasicBlock::reverse_iterator End = MBB.rend(); + for (MachineBasicBlock::reverse_iterator I = Begin; I != End;) { + MachineInstr &MI = *I; + ++I; + if (!MI.isPHI() || !isTileRegDef(MRI, MI)) + continue; + convertPHI(&MBB, MI); + } +} + +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + this->MBB = &MBB; + bool Change = false; + MachineInstr *LastShapeMI = nullptr; + MachineInstr *LastTileCfg = nullptr; + + for (MachineInstr &MI : reverse(MBB)) { + // We have transformed the phi nodes before configuring the BB. + if (MI.isPHI()) + break; + if (!isTileDef(MRI, MI)) + continue; + Change = true; + // If MI dominates the last shape def instruction, we need to insert + // ldtilecfg after LastShapeMI now. The config doesn't include + // the current MI.
+ // tilezero + // ldtilecfg <- insert + // def row + // def col + // tilezero(row, col) + if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) { + auto II = LastShapeMI->getIterator(); + if (CfgSS == -1) + CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(), + ST->getTileConfigAlignment(), false); + LastTileCfg = addFrameReference( + BuildMI(MBB, ++II, DebugLoc(), TII->get(X86::LDTILECFG)), CfgSS); + LastShapeMI = nullptr; + } + MachineOperand *RowMO = &MI.getOperand(1); + MachineOperand *ColMO = &MI.getOperand(2); + MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg()); + MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg()); + // If the shape is defined in the current MBB, check the domination. + // FIXME how about loop? + if (RowMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = RowMI; + else if (dominates(MBB, LastShapeMI, RowMI)) + LastShapeMI = RowMI; + } + if (ColMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = ColMI; + else if (dominates(MBB, LastShapeMI, ColMI)) + LastShapeMI = ColMI; + } + // If there is a user live out of the MBB, spill it and reload it before the + // user. + bool Spilled = false; + Register TileReg = MI.getOperand(0).getReg(); + for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) { + if (UseMI.getParent() == &MBB) { + // A use only needs spill/reload if it crosses the inserted ldtilecfg. + if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI)) + continue; + if (!Spilled) { + spill(++MI.getIterator(), TileReg, false); + Spilled = true; + } + // Reload before UseMI. + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } else { + if (!Spilled) { + spill(++MI.getIterator(), TileReg, false); + Spilled = true; + } + // Don't reload for a phi instruction; phi reloads are handled separately. + // TODO: merge the reload for the same user MBB. + if (!UseMI.isPHI()) + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } + } + } + + // Configure tile registers at the head of the MBB + if (Change) { + MachineInstr *Before; + if (LastShapeMI == nullptr || LastShapeMI->isPHI()) + Before = &*MBB.getFirstNonPHI(); + else + Before = &*(++LastShapeMI->getIterator()); + + if (CfgSS == -1) + CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(), + ST->getTileConfigAlignment(), false); + addFrameReference( + BuildMI(MBB, Before, DebugLoc(), TII->get(X86::LDTILECFG)), CfgSS); + } + + return Change; +} + +bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + MF = &MFunc; + MRI = &MFunc.getRegInfo(); + ST = &MFunc.getSubtarget<X86Subtarget>(); + TII = ST->getInstrInfo(); + X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); + MFI = &MFunc.getFrameInfo(); + TRI = ST->getRegisterInfo(); + CfgSS = -1; + + unsigned NumVirtRegs = MRI->getNumVirtRegs(); + StackSlotForVirtReg.resize(NumVirtRegs); + bool Change = false; + + assert(MRI->isSSA()); + // Loop over all of the basic blocks in reverse post order and insert + // ldtilecfg for tile registers. The reverse post order is to facilitate + // PHI node conversion.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(MF); + for (MachineBasicBlock *MBB : RPOT) { + convertPHIs(*MBB); + Change |= configBasicBlock(*MBB); + } + + if (Change) + InitializeTileConfigStackSpace(); + + StackSlotForVirtReg.clear(); + return Change; +} + +FunctionPass *llvm::createX86FastPreTileConfigPass() { + return new X86FastPreTileConfig(); +} diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -40,40 +40,25 @@ class X86FastTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; X86MachineFunctionInfo *X86FI = nullptr; - MachineInstr *getTileConfigPoint(); - void tileConfig(); + bool configBasicBlock(MachineBasicBlock &MBB); public: X86FastTileConfig() : MachineFunctionPass(ID) {} - bool fastTileConfig(); - bool isTileLoad(MachineInstr &MI); - bool isTileStore(MachineInstr &MI); - bool isAMXInstr(MachineInstr &MI); - - MachineInstr *getKeyAMXInstr(MachineInstr *MI); - void getTileShapesCfg(MachineInstr *MI, - SmallVector<MachineOperand *> &ShapedTiles); - void getShapeCfgInstrs(MachineInstr *MI, - std::map<unsigned, MachineInstr *> &RowCfgs, - std::map<unsigned, MachineInstr *> &ColCfgs); - /// Return the pass name. StringRef getPassName() const override { return "Fast Tile Register Configure"; } - void materializeTileCfg(MachineInstr *MI); - - void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles, - std::map<unsigned, MachineInstr *> &RowCfgs, - std::map<unsigned, MachineInstr *> &ColCfgs); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } /// Perform register allocation. bool runOnMachineFunction(MachineFunction &MFunc) override; @@ -95,210 +80,105 @@ INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTilePhysReg(MachineOperand &Op) { - if (!Op.isReg()) +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // There is no phi instruction after register allocation. + assert(MI.isPHI() == false); + // The instruction must have 3 operands: tile def, row, col. + // It should be an AMX pseudo instruction that has a shape operand. + if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || + !MI.isPseudo()) return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet.
+ if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } - Register Reg = Op.getReg(); - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; return false; } -static unsigned getTilePhysRegIdx(MachineOperand *Op) { - assert(isTilePhysReg(*Op) && "Tile Operand is invalid"); - return Op->getReg() - X86::TMM0; -} - -static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 48 + TIdx; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 16 + TIdx * 2; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV || - MI.getOpcode() == X86::PTILELOADDT1V; -} -bool X86FastTileConfig::isTileStore(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILESTOREDV; -} -bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { - // TODO: May need to handle some special nontile amx instrucion. - if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) - return false; - - return llvm::any_of(MI.operands(), isTilePhysReg); -} - -MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *KeyMI = nullptr; - int KeyAMXNum = 0; - - for (auto II = Cfg; II != MBB->end(); II++) { - if (isTileLoad(*II)) { - KeyMI = &*II; +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + bool Change = false; + SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; + for (MachineInstr &MI : reverse(MBB)) { + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::LDTILECFG) continue; + // AMX instructions that define tile register. + if (MI.getOpcode() != X86::LDTILECFG) { + MachineOperand &Row = MI.getOperand(1); + MachineOperand &Col = MI.getOperand(2); + unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + } else { // LDTILECFG + // Rewrite the shape information to memory. Stack slot should have + // been initialized to zero in pre config. + int SS = MI.getOperand(0).getIndex(); // tile config stack slot. + for (auto &ShapeInfo : ShapeInfos) { + DebugLoc DL; + unsigned TMMIdx = ShapeInfo.first; + Register RowReg = ShapeInfo.second.getRow()->getReg(); + Register ColReg = ShapeInfo.second.getCol()->getReg(); + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero + int RowOffset = 48 + TMMIdx; + int ColOffset = 16 + TMMIdx * 2; + MachineInstrBuilder StoreRow = + BuildMI(MBB, MI, DL, TII->get(X86::MOV8mr)); + addFrameReference(StoreRow, SS, RowOffset) + .addReg(TRI->getSubReg(RowReg, X86::sub_8bit)); + + MachineInstrBuilder StoreCol = + BuildMI(MBB, MI, DL, TII->get(X86::MOV16mr)); + addFrameReference(StoreCol, SS, ColOffset).addReg(ColReg); + } + ShapeInfos.clear(); + Change = true; } - - if (isTileStore(*II)) { - assert(KeyMI && "Key AMX Should be found before!"); - break; - } - - if (isAMXInstr(*II)) { - assert((KeyAMXNum == 0) && "Too many Key AMX instruction!"); - (void) KeyAMXNum; - KeyAMXNum++; - KeyMI = &*II; - } - } - assert(KeyMI && "There must be an AMX instruction."); - return KeyMI; -} - -// Orderly get the tiles in key amx instruction, uses before defs. -void X86FastTileConfig::getTileShapesCfg( - MachineInstr *CfgMI, SmallVector &ShapedTiles) { - MachineInstr *KeyMI = getKeyAMXInstr(CfgMI); - - SmallVector DefTiles; - for (MachineOperand &MO : KeyMI->operands()) { - if (!isTilePhysReg(MO)) - continue; - if (MO.isDef()) - DefTiles.push_back(&MO); - else - ShapedTiles.push_back(&MO); - } - ShapedTiles.append(DefTiles); -} - -// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and -// amx.shape.N.col*" at pass "Pre AMX Tile Config". -// The 'N' implies the order of tiles in key amx intrinsic. -void X86FastTileConfig::getShapeCfgInstrs( - MachineInstr *MI, std::map &RowCfgs, - std::map &ColCfgs) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - - for (auto II = Cfg; II != MBB->begin(); II--) { - if (isAMXInstr(*II) || II->isTerminator() || II->isCall()) - break; - if (!II->mayStore() || !II->hasOneMemOperand()) - continue; - const Value *MemPtr = II->memoperands()[0]->getValue(); - if (!MemPtr) - continue; - - StringRef Name = MemPtr->getName(); - if (!Name.startswith("amx.tmm.")) - continue; - - // Get the 'N'th tile shape config in key amx instruction. - auto N = Name.find(".shape"); - StringRef STileIdx = Name.slice(8, N); - unsigned Idx; - STileIdx.getAsInteger(10, Idx); - - // And related them with their store instructions. - if (Name.contains("row")) - RowCfgs[Idx] = &*II; - else if (Name.contains("col")) - ColCfgs[Idx] = &*II; - else - llvm_unreachable("Invalid tile shape info!"); } - assert((RowCfgs.size() == ColCfgs.size()) && - "The number of tile row and col must be equal!"); -} - -// Here is the data format for the tile config. -// 0 palette = 1 now. -// 1 start_row = 0 now. -// 2-15 reserved, must be zero -// 16-17 tile0.colsb Tile 0 bytes per row. -// 18-19 tile1.colsb Tile 1 bytes per row. -// 20-21 tile2.colsb Tile 2 bytes per row. -// ... (sequence continues) -// 30-31 tile7.colsb Tile 7 bytes per row. -// 32-47 reserved, must be zero -// 48 tile0.rows Tile 0 rows. -// 49 tile1.rows Tile 1 rows. -// 50 tile2.rows Tile 2 rows. -// ... (sequence continues) -// 55 tile7.rows Tile 7 rows. -// 56-63 reserved, must be zero -void X86FastTileConfig::rewriteTileCfg( - SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs) { - assert((RowCfgs.size() == ShapedTiles.size()) && - "The number of tile shapes not equal with the number of tiles!"); - // Orderly get the tiles and adjust the shape config. 
- for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) { - MachineOperand *MO = ShapedTiles[I]; - unsigned TmmIdx = getTilePhysRegIdx(MO); - if (I == TmmIdx) - continue; - adjustRowCfg(TmmIdx, RowCfgs[I]); - adjustColCfg(TmmIdx, ColCfgs[I]); - } -} - -// We have already preconfig the shapes before fast register allocation at -// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register -// allocation, the shapes pre-written before may not rightly corresponding -// to the correct tmm registers, so we need adjust them. -void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) { - SmallVector ShapedTiles; - std::map RowCfgs; - std::map ColCfgs; - - // Orderly keep the tile uses and def in ShapedTiles; - getTileShapesCfg(CfgMI, ShapedTiles); - assert(ShapedTiles.size() && "Not find shapes config!"); - - getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs); - - rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs); -} - -bool X86FastTileConfig::fastTileConfig() { - bool Changed = false; - - for (MachineBasicBlock &MBB : *MF) { - SmallVector CFGs; - for (MachineInstr &MI : MBB) - if (MI.getOpcode() == X86::PLDTILECFGV) - CFGs.push_back(&MI); - for (auto *MI : CFGs) - materializeTileCfg(MI); - if (!CFGs.empty()) - Changed = true; - } - if (Changed) + if (Change) X86FI->setHasVirtualTileReg(true); - return Changed; + + return Change; } bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { MF = &MFunc; MRI = &MFunc.getRegInfo(); - ST = &MFunc.getSubtarget(); + const TargetSubtargetInfo *ST = &MFunc.getSubtarget(); TRI = ST->getRegisterInfo(); TII = MFunc.getSubtarget().getInstrInfo(); X86FI = MFunc.getInfo(); + bool Change = false; + + // Loop over all of the basic blocks, eliminating virtual register references + for (MachineBasicBlock &MBB : MFunc) + Change |= configBasicBlock(MBB); - return fastTileConfig(); + return Change; } FunctionPass *llvm::createX86FastTileConfigPass() { diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,22 +48,23 @@ VEX, T8XD; // Pseduo instruction for RA. - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), [(int_x86_ldtilecfg_internal addr:$src)]>; - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; - let mayStore = 1 in + let isPseudo = true, mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in + let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, + canFoldAsLoad = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; @@ -106,7 +107,7 @@ } // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in { + let isPseudo = true, Constraints = "$src4 = $dst" in { def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), @@ -165,7 +166,7 @@ []>, VEX_4V, T8XS; // Pseduo instruction for RA. 
- let Constraints = "$src4 = $dst" in + let isPseudo = true, Constraints = "$src4 = $dst" in def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -78,6 +78,7 @@ initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); initializeX86TileConfigPass(PR); + initializeX86FastPreTileConfigPass(PR); initializeX86FastTileConfigPass(PR); initializeX86LowerTileCopyPass(PR); initializeX86ExpandPseudoPass(PR); @@ -420,8 +421,8 @@ addPass(createX86LowerAMXIntrinsicsPass()); addPass(createX86LowerAMXTypePass()); - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(createX86PreAMXConfigPass()); + // if (TM->getOptLevel() == CodeGenOpt::None) + // addPass(createX86PreAMXConfigPass()); TargetPassConfig::addIRPasses(); @@ -511,9 +512,10 @@ addPass(createX86FlagsCopyLoweringPass()); addPass(createX86DynAllocaExpander()); - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(createX86PreTileConfigPass()); - } + else + addPass(createX86FastPreTileConfigPass()); } void X86PassConfig::addMachineSSAOptimization() { diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll --- a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll @@ -12,6 +12,9 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 ; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, %ax ; AVX512-NEXT: movw %si, %cx ; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp) @@ -20,6 +23,7 @@ ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: xorl %esi, %esi ; AVX512-NEXT: movl $1088, %edx # imm = 0x440 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq memset@PLT ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -44,10 +48,12 @@ ; AVX512-NEXT: # %bb.1: # %if.then ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -57,15 +63,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -73,37 +75,39 @@ ; 
AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw (%rax), %di +; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -113,15 +117,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rdi) ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -131,10 +131,12 @@ ; AVX512-NEXT: .LBB0_2: # %if.else ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -144,15 +146,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -160,37 +158,39 @@ ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw (%rax), %di +; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -200,15 +200,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rdi) ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -219,7 +215,6 @@ ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: movl 
$1088, %edx # imm = 0x440 ; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq memcpy@PLT ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi @@ -516,23 +511,17 @@ ; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX512-NEXT: movw %r10w, %di -; AVX512-NEXT: shrl $2, %r10d -; AVX512-NEXT: movw %r10w, %r9w -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d +; AVX512-NEXT: movw %r8w, %di +; AVX512-NEXT: shrl $2, %r8d +; AVX512-NEXT: movw %r8w, %r9w +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $r10b killed $r10b killed $r10d -; AVX512-NEXT: movb %r10b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movl $64, %r8d @@ -599,9 +588,9 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $rdi killed $rax ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 @@ -617,12 +606,10 @@ ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 -; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16 ; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) @@ -638,18 +625,15 @@ ; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r9b -; 
AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%r8) ; AVX512-NEXT: movl $64, %r8d ; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0 ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll --- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll @@ -10,7 +10,10 @@ ; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800 +; AVX512-NEXT: subq $8192, %rsp # imm = 0x2000 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, %ax ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: movw %si, %ax @@ -30,34 +33,29 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX512-NEXT: movl $buf, %r9d ; AVX512-NEXT: movl $32, %r10d ; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl $64, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX512-NEXT: movl $buf, %r8d +; AVX512-NEXT: movl $32, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: movw $8, %si ; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $buf, %esi ; AVX512-NEXT: movl $32, %edi ; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -69,34 +67,29 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw 
$8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX512-NEXT: movl $buf2, %r9d ; AVX512-NEXT: movl $32, %r10d ; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl $64, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX512-NEXT: movl $buf2, %r8d +; AVX512-NEXT: movl $32, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: movw $8, %si ; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $buf2, %esi ; AVX512-NEXT: movl $32, %edi ; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -106,36 +99,39 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movl $64, %r10d +; AVX512-NEXT: movw $8, %di +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: tileloadd (%r8,%r10), %tmm0 +; AVX512-NEXT: movabsq $64, %r8 +; AVX512-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill +; AVX512-NEXT: movl $64, %r10d +; AVX512-NEXT: movw $8, %r8w +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: movw $8, %di -; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1 -; AVX512-NEXT: tileloadd (%r9,%rsi), 
%tmm2 -; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0 +; AVX512-NEXT: tileloadd (%r9,%r10), %tmm2 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: tileloadd (%rsi,%r8), %tmm0 +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movabsq $64, %r8 +; AVX512-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload ; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movl $buf, %edx diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir @@ -0,0 +1,177 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=fastpretileconfig -o - %s | FileCheck %s +# +# This case test tile phi is nested accessed, but the its def block is +# not visited yet. +# +# BB.5 +# %6 = phi(%3, b%10) <----- +# | | | +# | | | +# BB.6 BB.7 | +# \ / | +# \ / | +# \ / | +# BB.8 ------------- +# %10 = phi(%8, %9) +# +# #define STRIDE 32 +# void foo(int cond, char *buf) { +# __tile1024i a = {16, 64}; +# __tile1024i b = {16, 64}; +# __tile1024i c = {16, 64}; +# +# if (cond) { +# __tile_zero(&c); +# } else { +# __tile_loadd(&c, buf, STRIDE); +# } +# __tile_zero(&a); +# __tile_zero(&b); +# for(int i = 0; i < 10; i++) { +# __tile_dpbssd(&c, a, b); +# if (cond) { +# __tile_zero(&c); +# } else { +# __tile_loadd(&c, buf, STRIDE); +# } +# } +# __tile_stored(buf, STRIDE, c); +# } +--- +name: foo +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr8 } + - { id: 1, class: tile } + - { id: 2, class: tile } + - { id: 3, class: tile } + - { id: 4, class: tile } + - { id: 5, class: tile } + - { id: 6, class: gr32 } + - { id: 7, class: tile } + - { id: 8, class: tile } + - { id: 9, class: tile } + - { id: 10, class: tile } + - { id: 11, class: gr32 } + - { id: 12, class: gr32 } + - { id: 13, class: gr32 } + - { id: 14, class: gr64 } + - { id: 15, class: gr64 } + - { id: 16, class: gr8 } + - { id: 17, class: gr16 } + - { id: 18, class: gr16 } + - { id: 19, class: gr64_nosp } + - { id: 20, class: gr16 } + - { id: 21, class: gr16 } + - { id: 22, class: gr32 } + - { id: 23, class: gr16 } + - { id: 24, class: gr16 } + - { id: 25, class: gr16 } + - { id: 26, class: gr16 } + - { id: 27, class: gr16 } + - { id: 28, class: gr16 } + - { id: 29, class: tile } + - { id: 30, class: gr16 } + - { id: 31, class: gr16 } + - { id: 32, class: gr64_nosp } + - { id: 33, class: gr16 } + - { id: 34, class: gr16 } + - { id: 35, class: gr32 } + - { id: 36, class: gr64_nosp } + - { id: 37, class: gr16 } + - { id: 38, class: gr16 } +liveins: + - { reg: '$edi', virtual-reg: '%12' } + - { reg: '$rsi', virtual-reg: '%14' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $edi, $rsi + + %14:gr64 = COPY $rsi + %12:gr32 = COPY $edi + %13:gr32 = COPY killed %12 + %15:gr64 = COPY killed %14 + CMP32ri8 %13, 0, implicit-def $eflags + %16:gr8 = SETCCr 4, implicit $eflags + TEST8ri %16, 1, implicit-def $eflags + JCC_1 %bb.2, 5, implicit $eflags + + bb.1: + %17:gr16 = MOV16ri 64 + %18:gr16 = MOV16ri 16 + %1:tile = PTILEZEROV killed %18, killed %17 + JMP_1 %bb.3 + 
+ bb.2: + %19:gr64_nosp = MOV32ri64 32 + %20:gr16 = MOV16ri 64 + %21:gr16 = MOV16ri 16 + %2:tile = PTILELOADDV killed %21, killed %20, %15, 1, killed %19, 0, $noreg + + bb.3: + ; CHECK: %43:gr16 = PHI %17, %bb.1, %20, %bb.2 + ; CHECK-NEXT: %42:gr16 = PHI %18, %bb.1, %21, %bb.2 + ; CHECK-NEXT: %41:gr64_nosp = PHI %44, %bb.1, %45, %bb.2 + ; CHECK-NEXT: LDTILECFG + + %3:tile = PHI %1, %bb.1, %2, %bb.2 + %25:gr16 = MOV16ri 64 + %26:gr16 = MOV16ri 16 + %4:tile = PTILEZEROV killed %26, killed %25 + %23:gr16 = MOV16ri 64 + %24:gr16 = MOV16ri 16 + %5:tile = PTILEZEROV killed %24, killed %23 + %22:gr32 = MOV32r0 implicit-def $eflags + JMP_1 %bb.5 + + bb.4: + %36:gr64_nosp = MOV32ri64 32 + %37:gr16 = MOV16ri 64 + %38:gr16 = MOV16ri 16 + PTILESTOREDV killed %38, killed %37, %15, 1, killed %36, 0, $noreg, %10 + RET64 + + bb.5: + ; CHECK: %6:gr32 = PHI %22, %bb.3, %35, %bb.8 + ; CHECK-NEXT: %56:gr16 = PHI %43, %bb.3, %60, %bb.8 + ; CHECK-NEXT: %55:gr16 = PHI %42, %bb.3, %59, %bb.8 + ; CHECK-NEXT: %54:gr64_nosp = PHI %57, %bb.3, %64, %bb.8 + ; CHECK-NEXT: LDTILECFG + + %6:gr32 = PHI %22, %bb.3, %35, %bb.8 + %7:tile = PHI %3, %bb.3, %10, %bb.8 + %27:gr16 = MOV16ri 64 + %28:gr16 = MOV16ri 16 + %29:tile = PTDPBSSDV killed %28, %27, %27, %7, %4, %5 + TEST8ri %16, 1, implicit-def $eflags + JCC_1 %bb.7, 5, implicit $eflags + + bb.6: + %30:gr16 = MOV16ri 64 + %31:gr16 = MOV16ri 16 + %8:tile = PTILEZEROV killed %31, killed %30 + JMP_1 %bb.8 + + bb.7: + %32:gr64_nosp = MOV32ri64 32 + %33:gr16 = MOV16ri 64 + %34:gr16 = MOV16ri 16 + %9:tile = PTILELOADDV killed %34, killed %33, %15, 1, killed %32, 0, $noreg + + bb.8: + ; CHECK: %60:gr16 = PHI %30, %bb.6, %33, %bb.7 + ; CHECK-NEXT: %59:gr16 = PHI %31, %bb.6, %34, %bb.7 + ; CHECK-NEXT: %58:gr64_nosp = PHI %61, %bb.6, %62, %bb.7 + ; CHECK-NEXT: LDTILECFG + + %10:tile = PHI %8, %bb.6, %9, %bb.7 + %35:gr32 = ADD32ri8 %6, 1, implicit-def $eflags + CMP32ri8 %35, 10, implicit-def $eflags + JCC_1 %bb.4, 4, implicit $eflags + JMP_1 %bb.5 + +... diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir @@ -0,0 +1,154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass=fastpretileconfig -o - %s | FileCheck %s + +# Test spill/reload across basic block. 
+ +--- +name: foo +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: tile } + - { id: 3, class: gr64_nosp } + - { id: 4, class: gr64 } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: gr32 } + - { id: 9, class: vr512 } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 1024, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8 + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri1]], [[MOV16ri]] + ; CHECK-NEXT: [[MOV64ri:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, killed [[MOV64ri]], 0, $noreg, [[PTILEZEROV]] :: (store (s8192) into %stack.3) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri1:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, killed [[MOV64ri1]], 0, $noreg, [[PTILELOADDV]] :: (store (s8192) into %stack.2) + ; CHECK-NEXT: %row:gr16 = MOV16ri 32 + ; CHECK-NEXT: %col:gr16 = MOV16ri 8 + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV %row, %col, [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri2:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.2, 1, killed [[MOV64ri2]], 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: [[MOV64ri3:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV3:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.3, 1, killed [[MOV64ri3]], 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV1]], killed [[PTILELOADDV3]], killed [[PTILELOADDV2]] + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]] + ; CHECK-NEXT: RET 0, killed $eax + bb.0.entry: + %0:gr16 = MOV16ri 32 + %1:gr16 = MOV16ri 8 + %2:tile = PTILEZEROV %1, %0 + %3:gr64_nosp = MOV32ri64 32 + 
%4:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + %5:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + %row:gr16 = MOV16ri 32 + %col:gr16 = MOV16ri 8 + JMP_1 %bb.1 + bb.1: + %6:tile = PTILELOADDV %row, %col, %4, 1, %3, 0, $noreg + %7:tile = PTDPBSSDV %1, %0, %0, killed %6, killed %2, killed %5 + PTILESTOREDV killed %1, killed %0, killed %4, 1, killed %3, 0, $noreg, killed %7 + %8:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY killed %8 + RET 0, killed $eax + +... + +# Test tile copy fold +--- +name: copy +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: tile } + - { id: 3, class: gr64_nosp } + - { id: 4, class: gr64 } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: gr32 } + - { id: 9, class: vr512 } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 1024, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: copy + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8 + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri1]], [[MOV16ri]] + ; CHECK-NEXT: [[MOV64ri:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, killed [[MOV64ri]], 0, $noreg, [[PTILEZEROV]] :: (store (s8192) into %stack.3) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri1:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, killed [[MOV64ri1]], 0, $noreg, [[PTILELOADDV]] :: (store (s8192) into %stack.2) + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri2:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %t:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.2, 1, killed [[MOV64ri2]], 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: [[MOV64ri3:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.3, 1, killed [[MOV64ri3]], 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]], killed %t + ; CHECK-NEXT: 
PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]] + ; CHECK-NEXT: RET 0, killed $eax + bb.0.entry: + %0:gr16 = MOV16ri 32 + %1:gr16 = MOV16ri 8 + %2:tile = PTILEZEROV %1, %0 + %3:gr64_nosp = MOV32ri64 32 + %4:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + %5:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + JMP_1 %bb.1 + bb.1: + %6:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + %t:tile = COPY %5 + %7:tile = PTDPBSSDV %1, %0, %0, killed %6, killed %2, killed %t + PTILESTOREDV killed %1, killed %0, killed %4, 1, killed %3, 0, $noreg, killed %7 + %8:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY killed %8 + RET 0, killed $eax + +... diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir @@ -0,0 +1,146 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=fastpretileconfig -o - %s | FileCheck %s + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-unknown" + + @buf = dso_local global [1024 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + + define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 { + entry: + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + + if.then: ; preds = %entry + %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + br label %if.end + + if.else: ; preds = %entry + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + br label %if.end + + if.end: ; preds = %if.else, %if.then + %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ] + %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ] + %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ] + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6) + ret void + } + + declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1 + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1 + declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1 + + attributes #0 = { 
"target-features"="+amx-int8,+avx512f" } + attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" } + +... +--- +name: test_api +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: tile } + - { id: 1, class: tile } + - { id: 2, class: tile } + - { id: 3, class: tile } + - { id: 4, class: tile } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: tile } + - { id: 9, class: gr32 } + - { id: 10, class: gr32 } + - { id: 11, class: gr32 } + - { id: 12, class: gr16 } + - { id: 13, class: gr16 } + - { id: 14, class: gr64 } + - { id: 15, class: gr64_nosp } + - { id: 16, class: gr16 } + - { id: 17, class: gr64 } + - { id: 18, class: gr64_nosp } + - { id: 19, class: gr16 } + - { id: 20, class: gr16 } + - { id: 21, class: tile } + - { id: 22, class: gr64 } + - { id: 23, class: gr64_nosp } +liveins: + - { reg: '$edi', virtual-reg: '%9' } + - { reg: '$esi', virtual-reg: '%10' } + - { reg: '$edx', virtual-reg: '%11' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $edi, $esi, $edx + + ; CHECK: {{%.*}}:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.3, 1, $noreg, 0, $noreg, {{%.*}} + + %11:gr32 = COPY killed $edx + %10:gr32 = COPY killed $esi + %9:gr32 = COPY killed $edi + %13:gr16 = COPY killed %11.sub_16bit + %12:gr16 = COPY killed %10.sub_16bit + TEST32rr killed %9, %9, implicit-def $eflags + JCC_1 %bb.2, 4, implicit killed $eflags + JMP_1 %bb.1 + + bb.1.if.then: + %14:gr64 = MOV32ri64 @buf + %15:gr64_nosp = MOV32ri64 32 + %16:gr16 = MOV16ri 8 + ; CHECK: LDTILECFG + %0:tile = PTILELOADDV %12, %16, %14, 1, %15, 0, $noreg + %1:tile = PTILELOADDV killed %16, %13, %14, 1, %15, 0, $noreg + %2:tile = PTILELOADDV %12, %13, killed %14, 1, killed %15, 0, $noreg + JMP_1 %bb.3 + + bb.2.if.else: + %17:gr64 = MOV32ri64 @buf2 + %18:gr64_nosp = MOV32ri64 32 + %19:gr16 = MOV16ri 8 + ; CHECK: LDTILECFG + %3:tile = PTILELOADDV %12, %19, %17, 1, %18, 0, $noreg + %4:tile = PTILELOADDV killed %19, %13, %17, 1, %18, 0, $noreg + %5:tile = PTILELOADDV %12, %13, killed %17, 1, killed %18, 0, $noreg + + bb.3.if.end: + + ; CHECK: bb.3.if.end + ; CHECK-NEXT: %44:gr16 = PHI %16, %bb.1, %19, %bb.2 + ; CHECK-NEXT: %43:gr16 = PHI %12, %bb.1, %12, %bb.2 + ; CHECK-NEXT: %42:gr64_nosp = PHI %45, %bb.1, %46, %bb.2 + ; CHECK-NEXT: %38:gr16 = PHI %13, %bb.1, %13, %bb.2 + ; CHECK-NEXT: %37:gr16 = PHI %16, %bb.1, %19, %bb.2 + ; CHECK-NEXT: %36:gr64_nosp = PHI %39, %bb.1, %40, %bb.2 + ; CHECK-NEXT: %32:gr16 = PHI %13, %bb.1, %13, %bb.2 + ; CHECK-NEXT: %31:gr16 = PHI %12, %bb.1, %12, %bb.2 + ; CHECK-NEXT: %30:gr64_nosp = PHI %33, %bb.1, %34, %bb.2 + ; CHECK-NEXT: LDTILECFG + ; CHECK-NEXT: %47:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %6:tile = PTILELOADDV %43, %44, %42, 1, killed %47, 0, $noreg + ; CHECK-NEXT: %41:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %7:tile = PTILELOADDV %37, %38, %36, 1, killed %41, 0, $noreg + ; CHECK-NEXT: %35:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %8:tile = PTILELOADDV %31, %32, %30, 1, killed %35, 0, $noreg + + %6:tile = PHI %0, %bb.1, %3, %bb.2 + %7:tile = PHI %1, %bb.1, %4, %bb.2 + %8:tile = PHI %2, %bb.1, %5, %bb.2 + %20:gr16 = MOV16ri 8 + %21:tile = PTDPBSSDV %12, %13, killed %20, killed %8, killed %6, killed %7 + %22:gr64 = MOV32ri64 @buf + %23:gr64_nosp = MOV32ri64 32 + PTILESTOREDV killed %12, killed %13, killed %22, 1, killed %23, 0, $noreg, killed %21 + RET 0 + +... 
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir
@@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass=fastpretileconfig -o - %s | FileCheck %s
+
+# Test the case where a TILELOADD is mixed in with pseudo AMX instructions
+...
+---
+name: main
+alignment: 16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64_nosp }
+  - { id: 1, class: gr64 }
+  - { id: 2, class: gr16 }
+  - { id: 3, class: gr16 }
+  - { id: 4, class: tile }
+  - { id: 5, class: tile }
+  - { id: 6, class: tile }
+  - { id: 7, class: tile }
+  - { id: 8, class: gr32 }
+  - { id: 9, class: vr512 }
+frameInfo:
+  maxAlignment: 16
+stack:
+  - { id: 0, size: 1024, alignment: 16 }
+  - { id: 1, size: 64, alignment: 4 }
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: main
+    ; CHECK: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
+    ; CHECK-NEXT: VMOVUPSZmr %stack.2, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.2, align 4)
+    ; CHECK-NEXT: MOV8mi %stack.2, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.2, align 4)
+    ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
+    ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32
+    ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8
+    ; CHECK-NEXT: LDTILECFG %stack.2, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.2, align 4)
+    ; CHECK-NEXT: $tmm0 = TILELOADD [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg
+    ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg
+    ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg
+    ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg
+    ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV2]], killed [[PTILELOADDV]], killed [[PTILELOADDV1]]
+    ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
+    ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags
+    ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]]
+    ; CHECK-NEXT: RET 0, killed $eax
+    %0:gr64_nosp = MOV32ri64 32
+    %1:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    %2:gr16 = MOV16ri 32
+    %3:gr16 = MOV16ri 8
+    $tmm0 = TILELOADD %1, 1, %0, 0, $noreg
+    %4:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg
+    %5:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg
+    %6:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg
+    %7:tile = PTDPBSSDV %3, %2, %2, killed %6, killed %4, killed %5
+    PTILESTOREDV killed %3, killed %2, killed %1, 1, killed %0, 0, $noreg, killed %7
+    %8:gr32 = MOV32r0 implicit-def dead $eflags
+    $eax = COPY killed %8
+    RET 0, killed $eax
+
+...
diff --git a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll --- a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll @@ -66,30 +66,27 @@ ; AVX512-O0-NEXT: pushq %rbp ; AVX512-O0-NEXT: movq %rsp, %rbp ; AVX512-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-O0-NEXT: subq $2048, %rsp # imm = 0x800 -; AVX512-O0-NEXT: movq %rsp, %rdx +; AVX512-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; AVX512-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-O0-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) ; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; AVX512-O0-NEXT: movw $32, %cx ; AVX512-O0-NEXT: movw $8, %ax +; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-O0-NEXT: tilezero %tmm0 ; AVX512-O0-NEXT: movl $64, %esi +; AVX512-O0-NEXT: movw $32, %cx +; AVX512-O0-NEXT: movw $8, %ax ; AVX512-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-O0-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movw $8, %cx -; AVX512-O0-NEXT: # kill: def $cl killed $cl killed $cx -; AVX512-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: ldtilecfg (%rax) ; AVX512-O0-NEXT: movl $64, %esi ; AVX512-O0-NEXT: movw $32, %cx ; AVX512-O0-NEXT: movw $8, %ax +; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-O0-NEXT: movl $1024, %edx # imm = 0x400 ; AVX512-O0-NEXT: movw $32, %cx @@ -106,32 +103,28 @@ ; AVX2-O0-NEXT: pushq %rbp ; AVX2-O0-NEXT: movq %rsp, %rbp ; AVX2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX2-O0-NEXT: subq $2048, %rsp # imm = 0x800 -; AVX2-O0-NEXT: movq %rsp, %rdx +; AVX2-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; AVX2-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; AVX2-O0-NEXT: movw $32, %cx ; AVX2-O0-NEXT: movw $8, %ax +; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-O0-NEXT: tilezero %tmm0 ; AVX2-O0-NEXT: movl $64, %esi +; AVX2-O0-NEXT: movw $32, %cx +; AVX2-O0-NEXT: movw $8, %ax ; AVX2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movw $8, %cx -; AVX2-O0-NEXT: # kill: def $cl killed $cl killed $cx -; AVX2-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: ldtilecfg (%rax) ; AVX2-O0-NEXT: movl $64, %esi ; AVX2-O0-NEXT: movw $32, %cx ; AVX2-O0-NEXT: movw $8, %ax +; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; 
AVX2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-O0-NEXT: movl $1024, %edx # imm = 0x400 ; AVX2-O0-NEXT: movw $32, %cx @@ -148,36 +141,30 @@ ; SSE2-O0-NEXT: pushq %rbp ; SSE2-O0-NEXT: movq %rsp, %rbp ; SSE2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; SSE2-O0-NEXT: subq $2048, %rsp # imm = 0x800 -; SSE2-O0-NEXT: movq %rsp, %rdx +; SSE2-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; SSE2-O0-NEXT: xorps %xmm0, %xmm0 ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movb $8, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; SSE2-O0-NEXT: movw $32, %cx ; SSE2-O0-NEXT: movw $8, %ax +; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: tilezero %tmm0 ; SSE2-O0-NEXT: movl $64, %esi +; SSE2-O0-NEXT: movw $32, %cx +; SSE2-O0-NEXT: movw $8, %ax ; SSE2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movw $8, %cx -; SSE2-O0-NEXT: # kill: def $cl killed $cl killed $cx -; SSE2-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: ldtilecfg (%rax) ; SSE2-O0-NEXT: movl $64, %esi ; SSE2-O0-NEXT: movw $32, %cx ; SSE2-O0-NEXT: movw $8, %ax +; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-O0-NEXT: movl $1024, %edx # imm = 0x400 ; SSE2-O0-NEXT: movw $32, %cx diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -20,7 +20,6 @@ ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store -; CHECK-NEXT: Pre AMX Tile Config ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -42,6 +41,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 DynAlloca Expander +; CHECK-NEXT: Fast Tile Register Preconfigure ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator