diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -38,6 +38,7 @@ X86PreAMXConfig.cpp X86LowerAMXIntrinsics.cpp X86TileConfig.cpp + X86FastPreTileConfig.cpp X86FastTileConfig.cpp X86PreTileConfig.cpp X86ExpandPseudo.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -79,6 +79,9 @@ /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); +/// Return a pass that preconfig the tile registers before fast reg allocation. +FunctionPass *createX86FastPreTileConfigPass(); + /// Return a pass that config the tile registers after fast reg allocation. FunctionPass *createX86FastTileConfigPass(); @@ -175,6 +178,7 @@ void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86FastPreTileConfigPass(PassRegistry &); void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86TileConfigPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -0,0 +1,471 @@ +//===-- X86FastPreTileConfig.cpp - Fast Tile Register Configure------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to preconfig the shape of physical tile registers +/// It inserts ldtilecfg ahead of each group of tile registers. The algorithm +/// walks each instruction of the basic block in reverse order. All the tile +/// registers that live out of the basic block are spilled and reloaded +/// before their users. It also checks the dependency of the shape to ensure +/// the shape is defined before ldtilecfg.
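+///
+/// A rough sketch (illustrative only, not verbatim MIR) of the cross-block
+/// handling described above, assuming a tile %t is defined in bb.0 and used
+/// in bb.1:
+///
+///   bb.0:
+///     %t:tile = PTILEZEROV %row, %col
+///     ... spill %t to a stack slot right after its def ...
+///   bb.1:
+///     %t1:tile = PTILELOADDV %row, %col, <spill slot>  ; reload before the use
+///     ... use of %t1 ...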
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "fastpretileconfig" + +STATISTIC(NumStores, "Number of stores added"); +STATISTIC(NumLoads, "Number of loads added"); + +namespace { + +class X86FastPreTileConfig : public MachineFunctionPass { + // context + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + X86MachineFunctionInfo *X86FI = nullptr; + MachineFrameInfo *MFI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineBasicBlock *MBB = nullptr; + int CfgSS = -1; + + /// Maps virtual regs to the frame index where these values are spilled. + IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg; + + int getStackSpaceFor(Register VirtReg); + void InitializeTileConfigStackSpace(); + void spill(MachineBasicBlock::iterator Before, Register VirtReg, bool Kill); + void reload(MachineBasicBlock::iterator UseMI, Register VirtReg, + MachineOperand *RowMO, MachineOperand *ColMO); + void convertPHI(MachineInstr &PHI); + bool configBasicBlock(MachineBasicBlock &MBB); + +public: + X86FastPreTileConfig() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + + /// Return the pass name. + StringRef getPassName() const override { + return "Fast Tile Register Preconfigure"; + } + + /// Perform tile register configure. + bool runOnMachineFunction(MachineFunction &MFunc) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86FastPreTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) +INITIALIZE_PASS_END(X86FastPreTileConfig, DEBUG_TYPE, + "Fast Tile Register Preconfigure", false, false) + +/// This allocates space for the specified virtual register to be held on the +/// stack. +int X86FastPreTileConfig::getStackSpaceFor(Register VirtReg) { + // Find the location Reg would belong... + int SS = StackSlotForVirtReg[VirtReg]; + // Already has space allocated? + if (SS != -1) + return SS; + + // Allocate a new stack object for this spill location... + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + unsigned Size = TRI->getSpillSize(RC); + Align Alignment = TRI->getSpillAlign(RC); + int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment); + + // Assign the slot.
+ StackSlotForVirtReg[VirtReg] = FrameIdx; + return FrameIdx; +} + +void X86FastPreTileConfig::InitializeTileConfigStackSpace() { + MachineBasicBlock &MBB = MF->front(); + MachineInstr *MI = &*MBB.getFirstNonPHI(); + DebugLoc DL; + if (ST->hasAVX512()) { + Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX512_512_SET0), Zmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), CfgSS) + .addReg(Zmm); + } else if (ST->hasAVX2()) { + Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::AVX_SET0), Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS) + .addReg(Ymm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), CfgSS, + 32) + .addReg(Ymm); + } else { + assert(ST->hasSSE2() && "AMX should assume SSE2 enabled"); + unsigned StoreOpc = ST->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass); + BuildMI(MBB, MI, DL, TII->get(X86::V_SET0), Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 16) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 32) + .addReg(Xmm); + addFrameReference(BuildMI(MBB, MI, DL, TII->get(StoreOpc)), CfgSS, 48) + .addReg(Xmm); + } + // Fill in the palette first. + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), CfgSS) + .addImm(1); +} + +static bool dominates(MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator A, + MachineBasicBlock::const_iterator B) { + auto MBBEnd = MBB.end(); + if (B == MBBEnd) + return true; + + MachineBasicBlock::const_iterator I = MBB.begin(); + for (; &*I != A && &*I != B; ++I) + ; + + return &*I == A; +} + +/// Insert spill instruction for \p VirtReg before \p Before. +/// TODO: Update DBG_VALUEs with \p VirtReg operands with the stack slot. +void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, + Register VirtReg, bool Kill) { + LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI) << " \n"); + int FI = getStackSpaceFor(VirtReg); + LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); + + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + // Don't need shape information for tile store, because it is adjacent to + // the tile def instruction. + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI); + ++NumStores; + + // TODO: update DBG_VALUEs +} + +/// Insert reload instruction for \p OrigReg before \p UseMI. +void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, + Register OrigReg, MachineOperand *RowMO, + MachineOperand *ColMO) { + int FI = getStackSpaceFor(OrigReg); + const TargetRegisterClass &RC = *MRI->getRegClass(OrigReg); + Register TileReg; + // Fold copy to tileload + // BB1: + // spill src to s + // + // BB2: + // t = copy src + // --> + // t = tileload (s) + if (UseMI->isCopy()) + TileReg = UseMI->getOperand(0).getReg(); + else + TileReg = MRI->createVirtualRegister(&RC); + // Can't use TII->loadRegFromStackSlot(), because we need the shape + // information for reload. + // tileloadd (%sp, %idx), %tmm + unsigned Opc = X86::PTILELOADDV; + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + // FIXME: MBB is not the parent of UseMI.
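+  // The sequence built below is roughly (illustrative MIR only, with
+  // hypothetical virtual register names):
+  //   %stride:gr64_nosp = MOV64ri 64
+  //   %t:tile = PTILELOADDV %row:gr16, %col:gr16, %stack.FI, 1, %stride, 0
+  // i.e. a tile load from the spill slot with a hard-coded stride of 64 bytes.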
+ MachineInstr *NewMI = BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), + TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + NewMI = addFrameReference( + BuildMI(*UseMI->getParent(), UseMI, DebugLoc(), TII->get(Opc), TileReg) + .addReg(RowMO->getReg()) + .addReg(ColMO->getReg()), + FI); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); + RowMO->setIsKill(false); + ColMO->setIsKill(false); + // Erase copy instruction after it is folded. + if (UseMI->isCopy()) { + UseMI->eraseFromParent(); + } else { + // Replace the register in the user MI. + for (auto &MO : UseMI->operands()) { + if (MO.isReg() && MO.getReg() == OrigReg) + MO.setReg(TileReg); + } + } + + ++NumLoads; + LLVM_DEBUG(dbgs() << "Reloading " << printReg(OrigReg, TRI) << " into " + << printReg(TileReg, TRI) << '\n'); +} + +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // The instruction must have 3 operands: tile def, row, col. + if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) + return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet. + if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } + + return false; +} + +static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { + MachineInstr *MI = MRI->getVRegDef(TileReg); + if (isTileDef(MRI, *MI)) { + MachineOperand *RowMO = &MI->getOperand(1); + MachineOperand *ColMO = &MI->getOperand(2); + return ShapeT(RowMO, ColMO, MRI); + } else if (MI->isCopy()) { + TileReg = MI->getOperand(1).getReg(); + return getShape(MRI, TileReg); + } + + // The def should not be PHI node, because we walk the MBB in reverse post + // order. + assert(MI->isPHI() && "Unexpected PHI when get shape."); + llvm_unreachable("Unexpected MI when get shape."); +} + +// BB0: +// spill t0 to s0 +// BB1: +// spill t1 to s1 +// +// BB2: +// t = phi [t0, bb0] [t1, bb1] +// --> +// row = phi [r0, bb0] [r1, bb1] +// col = phi [c0, bb0] [c1, bb1] +// s = phi [s0, bb0] [s1, bb1] +// t = tileload row, col, s +// The new instruction is inserted at the end of the phi node. The order +// of the original phi node is not ensured. +void X86FastPreTileConfig::convertPHI(MachineInstr &PHI) { + // 1. Create instruction to get stack slot address of each incoming block. + // 2. Create PHI node for the stack address. + // 3. Create PHI node for shape. If one of the incoming shape is immediate + // use the immediate and delete the PHI node. + // 4. Create tileload instruction from the stack address. 
+ Register StackAddrReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + MachineInstrBuilder AddrPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), StackAddrReg); + Register RowReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder RowPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), RowReg); + Register ColReg = MRI->createVirtualRegister(&X86::GR16RegClass); + MachineInstrBuilder ColPHI = BuildMI(*MBB, ++PHI.getIterator(), DebugLoc(), + TII->get(X86::PHI), ColReg); + for (unsigned I = 1, E = PHI.getNumOperands(); I != E; I += 2) { + Register InTileReg = PHI.getOperand(I).getReg(); + MachineBasicBlock *InMBB = PHI.getOperand(I + 1).getMBB(); + int FI = getStackSpaceFor(InTileReg); + Register InStackAddrReg = + MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + MachineInstr *TileDefMI = MRI->getVRegDef(InTileReg); + addOffset(BuildMI(*TileDefMI->getParent(), TileDefMI, DebugLoc(), + TII->get(X86::LEA64r), InStackAddrReg) + .addFrameIndex(FI), + 0); + AddrPHI.addReg(InStackAddrReg).addMBB(InMBB); + ShapeT Shape = getShape(MRI, InTileReg); + Shape.getRow()->setIsKill(false); + Shape.getCol()->setIsKill(false); + RowPHI.addReg(Shape.getRow()->getReg()).addMBB(InMBB); + ColPHI.addReg(Shape.getCol()->getReg()).addMBB(InMBB); + } + + MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI(); + Register StrideReg = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::MOV64ri), StrideReg) + .addImm(64); + Register TileReg = PHI.getOperand(0).getReg(); + MachineInstr *NewMI = addDirectMem( + BuildMI(*MBB, InsertPos, DebugLoc(), TII->get(X86::PTILELOADDV), TileReg) + .addReg(RowReg) + .addReg(ColReg), + StackAddrReg); + MachineOperand &MO = NewMI->getOperand(5); + MO.setReg(StrideReg); + MO.setIsKill(true); +} + +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + this->MBB = &MBB; + bool Change = false; + auto DefTileReg = [&](MachineInstr &MI) { + MachineOperand &MO = MI.getOperand(0); + if (MO.isReg() && MO.getReg().isVirtual() && + MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + return true; + return false; + }; + + MachineInstr *LastShapeMI = nullptr; + MachineInstr *LastTileCfg = nullptr; + SmallVector<MachineInstr *, 2> DeadPHIs; + for (MachineInstr &MI : reverse(MBB)) { + if (MI.isPHI() && DefTileReg(MI)) { + convertPHI(MI); + DeadPHIs.push_back(&MI); + continue; + } + if (!isTileDef(MRI, MI)) + continue; + Change = true; + // If MI dominates the last shape def instruction, we need to insert + // ldtilecfg after LastShapeMI now. The config doesn't include + // current MI. + // tilezero + // ldtilecfg <- insert + // def row + // def col + // tilezero(row, col) + if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) { + auto II = LastShapeMI->getIterator(); + if (CfgSS == -1) + CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(), + ST->getTileConfigAlignment(), false); + LastTileCfg = addFrameReference( + BuildMI(MBB, ++II, DebugLoc(), TII->get(X86::LDTILECFG)), CfgSS); + LastShapeMI = nullptr; + } + MachineOperand *RowMO = &MI.getOperand(1); + MachineOperand *ColMO = &MI.getOperand(2); + MachineInstr *RowMI = MRI->getVRegDef(RowMO->getReg()); + MachineInstr *ColMI = MRI->getVRegDef(ColMO->getReg()); + // If the shape is defined in the current MBB, check the domination. + // FIXME how about loop?
+ if (RowMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = RowMI; + else if (dominates(MBB, LastShapeMI, RowMI)) + LastShapeMI = RowMI; + } + if (ColMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = ColMI; + else if (dominates(MBB, LastShapeMI, ColMI)) + LastShapeMI = ColMI; + } + // If a user is live out of the MBB, spill the tile register and reload it + // before the user. + bool Spilled = false; + Register TileReg = MI.getOperand(0).getReg(); + for (MachineInstr &UseMI : MRI->use_instructions(TileReg)) { + if (UseMI.getParent() == &MBB) { + // A use in the same MBB only needs a reload if it crosses an inserted + // ldtilecfg. + if (!LastTileCfg || !dominates(MBB, LastTileCfg, UseMI)) + continue; + if (!Spilled) { + spill(++MI.getIterator(), TileReg, false); + Spilled = true; + } + // Reload before UseMI. + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } else { + if (!Spilled) { + spill(++MI.getIterator(), TileReg, false); + Spilled = true; + } + // Don't reload for a phi instruction; phi reloads are handled + // separately. + // TODO: merge the reload for the same user MBB. + if (!UseMI.isPHI()) + reload(UseMI.getIterator(), TileReg, RowMO, ColMO); + } + } + } + + for (MachineInstr *DeadPHI : DeadPHIs) + DeadPHI->eraseFromParent(); + // Configure tile registers at the head of the MBB. + if (Change) { + MachineInstr *Before; + if (LastShapeMI == nullptr) + Before = &*MBB.getFirstNonPHI(); + else + Before = &*(++LastShapeMI->getIterator()); + + if (CfgSS == -1) + CfgSS = MFI->CreateStackObject(ST->getTileConfigSize(), + ST->getTileConfigAlignment(), false); + addFrameReference( + BuildMI(MBB, Before, DebugLoc(), TII->get(X86::LDTILECFG)), CfgSS); + } + + return Change; +} + +bool X86FastPreTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + MF = &MFunc; + MRI = &MFunc.getRegInfo(); + ST = &MFunc.getSubtarget<X86Subtarget>(); + TII = ST->getInstrInfo(); + X86FI = MFunc.getInfo<X86MachineFunctionInfo>(); + MFI = &MFunc.getFrameInfo(); + TRI = ST->getRegisterInfo(); + CfgSS = -1; + + unsigned NumVirtRegs = MRI->getNumVirtRegs(); + StackSlotForVirtReg.resize(NumVirtRegs); + bool Change = false; + + assert(MRI->isSSA()); + // Loop over all of the basic blocks in reverse post order and insert + // ldtilecfg for tile registers. The reverse post order facilitates + // PHI node conversion.
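+  // For example (hypothetical CFG): if bb.0 branches to bb.1 and bb.2, which
+  // both jump to bb.3, a reverse post order visits bb.0, bb.1, bb.2 and only
+  // then bb.3, so the tiles feeding a PHI in bb.3 have already been spilled
+  // to their stack slots by the time convertPHI rewrites that PHI.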
+ ReversePostOrderTraversal<MachineFunction *> RPOT(MF); + for (MachineBasicBlock *MBB : RPOT) + Change |= configBasicBlock(*MBB); + + if (Change) + InitializeTileConfigStackSpace(); + + StackSlotForVirtReg.clear(); + return Change; +} + +FunctionPass *llvm::createX86FastPreTileConfigPass() { + return new X86FastPreTileConfig(); +} diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -40,40 +40,25 @@ class X86FastTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI = nullptr; const TargetInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; X86MachineFunctionInfo *X86FI = nullptr; - MachineInstr *getTileConfigPoint(); - void tileConfig(); + bool configBasicBlock(MachineBasicBlock &MBB); public: X86FastTileConfig() : MachineFunctionPass(ID) {} - bool fastTileConfig(); - bool isTileLoad(MachineInstr &MI); - bool isTileStore(MachineInstr &MI); - bool isAMXInstr(MachineInstr &MI); - - MachineInstr *getKeyAMXInstr(MachineInstr *MI); - void getTileShapesCfg(MachineInstr *MI, - SmallVector &ShapedTiles); - void getShapeCfgInstrs(MachineInstr *MI, - std::map &RowCfgs, - std::map &ColCfgs); - /// Return the pass name. StringRef getPassName() const override { return "Fast Tile Register Configure"; } - void materializeTileCfg(MachineInstr *MI); - - void rewriteTileCfg(SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } /// Perform register allocation. bool runOnMachineFunction(MachineFunction &MFunc) override; @@ -95,210 +80,105 @@ INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTilePhysReg(MachineOperand &Op) { - if (!Op.isReg()) +static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { + // There is no phi instruction after register allocation. + assert(MI.isPHI() == false); + // The instruction must have 3 operands: tile def, row, col. + // It should be an AMX pseudo instruction that has shape operands. + if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || + !MI.isPseudo()) return false; + MachineOperand &MO = MI.getOperand(0); + + if (MO.isReg()) { + Register Reg = MO.getReg(); + // FIXME it may be used after Greedy RA and the physical + // register is not rewritten yet.
+ if (Reg.isVirtual() && + MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return true; + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + } - Register Reg = Op.getReg(); - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; return false; } -static unsigned getTilePhysRegIdx(MachineOperand *Op) { - assert(isTilePhysReg(*Op) && "Tile Operand is invalid"); - return Op->getReg() - X86::TMM0; -} - -static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 48 + TIdx; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { - unsigned Offset = 16 + TIdx * 2; - MI->getOperand(3).ChangeToImmediate(Offset); -} - -bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV || - MI.getOpcode() == X86::PTILELOADDT1V; -} -bool X86FastTileConfig::isTileStore(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILESTOREDV; -} -bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { - // TODO: May need to handle some special nontile amx instrucion. - if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) - return false; - - return llvm::any_of(MI.operands(), isTilePhysReg); -} - -MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *KeyMI = nullptr; - int KeyAMXNum = 0; - - for (auto II = Cfg; II != MBB->end(); II++) { - if (isTileLoad(*II)) { - KeyMI = &*II; +// PreTileConfig should configure the tile registers based on basic +// block. +bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { + bool Change = false; + SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos; + for (MachineInstr &MI : reverse(MBB)) { + if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::LDTILECFG) continue; + // AMX instructions that define a tile register. + if (MI.getOpcode() != X86::LDTILECFG) { + MachineOperand &Row = MI.getOperand(1); + MachineOperand &Col = MI.getOperand(2); + unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; + ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + } else { // LDTILECFG + // Rewrite the shape information to memory. The stack slot should have + // been initialized to zero in the pre config pass. + int SS = MI.getOperand(0).getIndex(); // tile config stack slot. + for (auto &ShapeInfo : ShapeInfos) { + DebugLoc DL; + unsigned TMMIdx = ShapeInfo.first; + Register RowReg = ShapeInfo.second.getRow()->getReg(); + Register ColReg = ShapeInfo.second.getCol()->getReg(); + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero + int RowOffset = 48 + TMMIdx; + int ColOffset = 16 + TMMIdx * 2; + MachineInstrBuilder StoreRow = + BuildMI(MBB, MI, DL, TII->get(X86::MOV8mr)); + addFrameReference(StoreRow, SS, RowOffset) + .addReg(TRI->getSubReg(RowReg, X86::sub_8bit)); + + MachineInstrBuilder StoreCol = + BuildMI(MBB, MI, DL, TII->get(X86::MOV16mr)); + addFrameReference(StoreCol, SS, ColOffset).addReg(ColReg); + } + ShapeInfos.clear(); + Change = true; } - - if (isTileStore(*II)) { - assert(KeyMI && "Key AMX Should be found before!"); - break; - } - - if (isAMXInstr(*II)) { - assert((KeyAMXNum == 0) && "Too many Key AMX instruction!"); - (void) KeyAMXNum; - KeyAMXNum++; - KeyMI = &*II; - } - } - assert(KeyMI && "There must be an AMX instruction."); - return KeyMI; -} - -// Orderly get the tiles in key amx instruction, uses before defs. -void X86FastTileConfig::getTileShapesCfg( - MachineInstr *CfgMI, SmallVector &ShapedTiles) { - MachineInstr *KeyMI = getKeyAMXInstr(CfgMI); - - SmallVector DefTiles; - for (MachineOperand &MO : KeyMI->operands()) { - if (!isTilePhysReg(MO)) - continue; - if (MO.isDef()) - DefTiles.push_back(&MO); - else - ShapedTiles.push_back(&MO); - } - ShapedTiles.append(DefTiles); -} - -// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and -// amx.shape.N.col*" at pass "Pre AMX Tile Config". -// The 'N' implies the order of tiles in key amx intrinsic. -void X86FastTileConfig::getShapeCfgInstrs( - MachineInstr *MI, std::map &RowCfgs, - std::map &ColCfgs) { - auto Cfg = MachineBasicBlock::iterator(MI); - MachineBasicBlock *MBB = MI->getParent(); - - for (auto II = Cfg; II != MBB->begin(); II--) { - if (isAMXInstr(*II) || II->isTerminator() || II->isCall()) - break; - if (!II->mayStore() || !II->hasOneMemOperand()) - continue; - const Value *MemPtr = II->memoperands()[0]->getValue(); - if (!MemPtr) - continue; - - StringRef Name = MemPtr->getName(); - if (!Name.startswith("amx.tmm.")) - continue; - - // Get the 'N'th tile shape config in key amx instruction. - auto N = Name.find(".shape"); - StringRef STileIdx = Name.slice(8, N); - unsigned Idx; - STileIdx.getAsInteger(10, Idx); - - // And related them with their store instructions. - if (Name.contains("row")) - RowCfgs[Idx] = &*II; - else if (Name.contains("col")) - ColCfgs[Idx] = &*II; - else - llvm_unreachable("Invalid tile shape info!"); } - assert((RowCfgs.size() == ColCfgs.size()) && - "The number of tile row and col must be equal!"); -} - -// Here is the data format for the tile config. -// 0 palette = 1 now. -// 1 start_row = 0 now. -// 2-15 reserved, must be zero -// 16-17 tile0.colsb Tile 0 bytes per row. -// 18-19 tile1.colsb Tile 1 bytes per row. -// 20-21 tile2.colsb Tile 2 bytes per row. -// ... (sequence continues) -// 30-31 tile7.colsb Tile 7 bytes per row. -// 32-47 reserved, must be zero -// 48 tile0.rows Tile 0 rows. -// 49 tile1.rows Tile 1 rows. -// 50 tile2.rows Tile 2 rows. -// ... (sequence continues) -// 55 tile7.rows Tile 7 rows. -// 56-63 reserved, must be zero -void X86FastTileConfig::rewriteTileCfg( - SmallVector &ShapedTiles, - std::map &RowCfgs, - std::map &ColCfgs) { - assert((RowCfgs.size() == ShapedTiles.size()) && - "The number of tile shapes not equal with the number of tiles!"); - // Orderly get the tiles and adjust the shape config. 
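As an aside (not part of the patch), the 64-byte ldtilecfg operand documented above can be sketched as a plain struct; all names below are hypothetical and the snippet only illustrates the byte offsets the stores compute (48 + TMMIdx for rows, 16 + TMMIdx * 2 for bytes per row):

#include <cstddef>
#include <cstdint>
#include <cstdio>

struct TileConfig {
  uint8_t Palette;        // byte 0, set to 1 by the pre-config pass
  uint8_t StartRow;       // byte 1
  uint8_t Reserved0[14];  // bytes 2-15, must be zero
  uint16_t ColsB[8];      // bytes 16-31, tmm0..tmm7 bytes per row
  uint8_t Reserved1[16];  // bytes 32-47, must be zero
  uint8_t Rows[8];        // bytes 48-55, tmm0..tmm7 rows
  uint8_t Reserved2[8];   // bytes 56-63, must be zero
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg reads exactly 64 bytes");
static_assert(offsetof(TileConfig, ColsB) == 16, "colsb starts at byte 16");
static_assert(offsetof(TileConfig, Rows) == 48, "rows start at byte 48");

int main() {
  TileConfig Cfg = {};   // the pre-config pass zero-fills the stack slot
  Cfg.Palette = 1;
  unsigned TMMIdx = 2;   // e.g. a shape bound to %tmm2
  Cfg.Rows[TMMIdx] = 16;    // what the MOV8mr to offset 48 + TMMIdx writes
  Cfg.ColsB[TMMIdx] = 64;   // what the MOV16mr to offset 16 + TMMIdx * 2 writes
  std::printf("tmm%u: %u rows, %u bytes per row\n", TMMIdx,
              (unsigned)Cfg.Rows[TMMIdx], (unsigned)Cfg.ColsB[TMMIdx]);
  return 0;
}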
- for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) { - MachineOperand *MO = ShapedTiles[I]; - unsigned TmmIdx = getTilePhysRegIdx(MO); - if (I == TmmIdx) - continue; - adjustRowCfg(TmmIdx, RowCfgs[I]); - adjustColCfg(TmmIdx, ColCfgs[I]); - } -} - -// We have already preconfig the shapes before fast register allocation at -// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register -// allocation, the shapes pre-written before may not rightly corresponding -// to the correct tmm registers, so we need adjust them. -void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) { - SmallVector ShapedTiles; - std::map RowCfgs; - std::map ColCfgs; - - // Orderly keep the tile uses and def in ShapedTiles; - getTileShapesCfg(CfgMI, ShapedTiles); - assert(ShapedTiles.size() && "Not find shapes config!"); - - getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs); - - rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs); -} - -bool X86FastTileConfig::fastTileConfig() { - bool Changed = false; - - for (MachineBasicBlock &MBB : *MF) { - SmallVector CFGs; - for (MachineInstr &MI : MBB) - if (MI.getOpcode() == X86::PLDTILECFGV) - CFGs.push_back(&MI); - for (auto *MI : CFGs) - materializeTileCfg(MI); - if (!CFGs.empty()) - Changed = true; - } - if (Changed) + if (Change) X86FI->setHasVirtualTileReg(true); - return Changed; + + return Change; } bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { MF = &MFunc; MRI = &MFunc.getRegInfo(); - ST = &MFunc.getSubtarget(); + const TargetSubtargetInfo *ST = &MFunc.getSubtarget(); TRI = ST->getRegisterInfo(); TII = MFunc.getSubtarget().getInstrInfo(); X86FI = MFunc.getInfo(); + bool Change = false; + + // Loop over all of the basic blocks, eliminating virtual register references + for (MachineBasicBlock &MBB : MFunc) + Change |= configBasicBlock(MBB); - return fastTileConfig(); + return Change; } FunctionPass *llvm::createX86FastTileConfigPass() { diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,22 +48,23 @@ VEX, T8XD; // Pseduo instruction for RA. - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PLDTILECFGV : PseudoI<(outs), (ins opaquemem:$src), [(int_x86_ldtilecfg_internal addr:$src)]>; - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; - let mayLoad = 1 in + let isPseudo = true, mayLoad = 1 in def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; - let mayStore = 1 in + let isPseudo = true, mayStore = 1 in def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; - let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in + let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, + canFoldAsLoad = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; @@ -106,7 +107,7 @@ } // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in { + let isPseudo = true, Constraints = "$src4 = $dst" in { def PTDPBSSDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), @@ -165,7 +166,7 @@ []>, VEX_4V, T8XS; // Pseduo instruction for RA. 
- let Constraints = "$src4 = $dst" in + let isPseudo = true, Constraints = "$src4 = $dst" in def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6), diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -78,6 +78,7 @@ initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); initializeX86TileConfigPass(PR); + initializeX86FastPreTileConfigPass(PR); initializeX86FastTileConfigPass(PR); initializeX86LowerTileCopyPass(PR); initializeX86ExpandPseudoPass(PR); @@ -420,8 +421,8 @@ addPass(createX86LowerAMXIntrinsicsPass()); addPass(createX86LowerAMXTypePass()); - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(createX86PreAMXConfigPass()); + // if (TM->getOptLevel() == CodeGenOpt::None) + // addPass(createX86PreAMXConfigPass()); TargetPassConfig::addIRPasses(); @@ -511,9 +512,10 @@ addPass(createX86FlagsCopyLoweringPass()); addPass(createX86DynAllocaExpander()); - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(createX86PreTileConfigPass()); - } + else + addPass(createX86FastPreTileConfigPass()); } void X86PassConfig::addMachineSSAOptimization() { diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll --- a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll @@ -1,18 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2 - - -source_filename = "amx_api.c" %struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }> - @buf = dso_local global [1024 x i8] zeroinitializer, align 16 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 -; Function Attrs: noinline nounwind optnone uwtable -define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 { +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind #0 { ; AVX512-LABEL: test_api: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: pushq %rbp @@ -22,6 +15,9 @@ ; AVX512-NEXT: .cfi_def_cfa_register %rbp ; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 ; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, %ax ; AVX512-NEXT: movw %si, %cx ; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp) @@ -30,6 +26,7 @@ ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: xorl %esi, %esi ; AVX512-NEXT: movl $1088, %edx # imm = 0x440 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq memset@PLT ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -54,10 +51,12 @@ ; AVX512-NEXT: # %bb.1: # %if.then ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; 
AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -67,15 +66,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -83,37 +78,39 @@ ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw (%rax), %di +; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -123,15 +120,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) 
-; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rdi) ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -141,10 +134,12 @@ ; AVX512-NEXT: .LBB0_2: # %if.else ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -154,15 +149,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -170,37 +161,39 @@ ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw (%rax), %di +; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movabsq $buf2, %rax +; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw (%rax), %si +; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw 2(%rax), %dx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -210,15 +203,11 @@ ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rdi) ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: addq $64, %rdx @@ -229,7 +218,6 @@ ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: movl $1088, %edx # imm = 0x440 ; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq memcpy@PLT ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi @@ -526,23 +514,17 @@ ; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX512-NEXT: movw %r10w, %di -; AVX512-NEXT: shrl $2, %r10d -; AVX512-NEXT: movw %r10w, %r9w -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r8b -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d +; AVX512-NEXT: movw %r8w, %di +; AVX512-NEXT: shrl $2, %r8d +; AVX512-NEXT: movw %r8w, %r9w +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $r10b killed $r10b killed $r10d -; AVX512-NEXT: movb %r10b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movl $64, %r8d @@ -609,9 +591,9 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $rdi killed $rax ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 @@ -627,12 +609,10 @@ ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 ; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 -; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16 ; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq 
%rcx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) @@ -648,18 +628,15 @@ ; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %r9b -; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%r8) ; AVX512-NEXT: movl $64, %r8d ; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0 ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) @@ -669,7 +646,6 @@ ; AVX512-NEXT: tilerelease ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq -; ; AVX2-LABEL: test_api: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: pushq %rbp @@ -679,6 +655,10 @@ ; AVX2-NEXT: .cfi_def_cfa_register %rbp ; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00 ; AVX2-NEXT: subq $29696, %rsp # imm = 0x7400 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %dx, %ax ; AVX2-NEXT: movw %si, %cx ; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) @@ -687,6 +667,7 @@ ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; AVX2-NEXT: xorl %esi, %esi ; AVX2-NEXT: movl $1088, %edx # imm = 0x440 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq memset@PLT ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -711,10 +692,12 @@ ; AVX2-NEXT: # %bb.1: # %if.then ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -724,16 +707,11 @@ ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx @@ -741,38 +719,39 @@ ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; 
AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movw (%rax), %di +; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx +; AVX2-NEXT: movl $64, %esi ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -782,16 +761,11 @@ ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rdi) ; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx @@ -801,10 +775,12 @@ ; AVX2-NEXT: .LBB0_2: # %if.else ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf2, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -814,16 +790,11 @@ ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: 
movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx @@ -831,38 +802,39 @@ ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf2, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movw (%rax), %di +; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx +; AVX2-NEXT: movl $64, %esi ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movabsq $buf2, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw (%rax), %si +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movw 2(%rax), %dx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -872,16 +844,11 @@ ; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rdi) ; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: addq $64, %rdx @@ -892,7 +859,6 @@ ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; AVX2-NEXT: movl $1088, %edx # imm = 0x440 ; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq memcpy@PLT ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi 
@@ -1641,24 +1607,17 @@ ; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX2-NEXT: movw %r10w, %di -; AVX2-NEXT: shrl $2, %r10d -; AVX2-NEXT: movw %r10w, %r9w -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d +; AVX2-NEXT: movw %r8w, %di +; AVX2-NEXT: shrl $2, %r8d +; AVX2-NEXT: movw %r8w, %r9w +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $r10b killed $r10b killed $r10d -; AVX2-NEXT: movb %r10b, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: movl $64, %r8d @@ -1791,6 +1750,8 @@ ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: # kill: def $rdi killed $rax ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 +; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3 @@ -1837,8 +1798,6 @@ ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 ; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 ; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) @@ -1875,8 +1834,6 @@ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) @@ -1892,19 +1849,15 @@ ; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r9b -; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%r8) ; AVX2-NEXT: movl $64, %r8d ; 
AVX2-NEXT: tileloadd (%rdi,%r8), %tmm0 ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) @@ -1914,7 +1867,6 @@ ; AVX2-NEXT: tilerelease ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq -; ; SSE2-LABEL: test_api: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pushq %rbp @@ -1924,6 +1876,12 @@ ; SSE2-NEXT: .cfi_def_cfa_register %rbp ; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00 ; SSE2-NEXT: subq $30720, %rsp # imm = 0x7800 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %dx, %ax ; SSE2-NEXT: movw %si, %cx ; SSE2-NEXT: movl %edi, {{[0-9]+}}(%rsp) @@ -1956,10 +1914,12 @@ ; SSE2-NEXT: # %bb.1: # %if.then ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1969,19 +1929,11 @@ ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx @@ -1989,40 +1941,39 @@ ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %di +; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) 
-; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx +; SSE2-NEXT: movl $64, %esi ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -2032,18 +1983,11 @@ ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rdi) ; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx @@ -2053,10 +1997,12 @@ ; SSE2-NEXT: .LBB0_2: # %if.else ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf2, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -2066,19 +2012,11 @@ ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx @@ -2086,40 +2024,39 @@ ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf2, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %di +; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), 
%rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx +; SSE2-NEXT: movl $64, %esi ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movabsq $buf2, %rax +; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw (%rax), %si +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movw 2(%rax), %dx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -2129,18 +2066,11 @@ ; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rdi) ; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: addq $64, %rdx @@ -3666,26 +3596,17 @@ ; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: movw %r10w, %di -; SSE2-NEXT: shrl $2, %r10d -; SSE2-NEXT: movw %r10w, %r9w -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: movw %r8w, %di +; SSE2-NEXT: shrl $2, %r8d +; SSE2-NEXT: movw %r8w, %r9w +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; SSE2-NEXT: # kill: def $r10b killed 
$r10b killed $r10d -; SSE2-NEXT: movb %r10b, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: movl $64, %r8d @@ -3944,6 +3865,8 @@ ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; SSE2-NEXT: # kill: def $rdi killed $rax ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 @@ -4054,8 +3977,6 @@ ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) @@ -4156,8 +4077,6 @@ ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp) @@ -4173,21 +4092,15 @@ ; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax ; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r9b -; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%r8) ; SSE2-NEXT: movl $64, %r8d ; SSE2-NEXT: tileloadd (%rdi,%r8), %tmm0 ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) @@ -4538,19 +4451,10 @@ ret void } -; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1 - -; Function Attrs: nounwind declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2 - -; Function Attrs: nounwind declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2 - -; Function Attrs: nounwind declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2 - -; Function Attrs: argmemonly nofree nosync nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3 attributes #0 = { noinline nounwind optnone uwtable } diff --git 
a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll --- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll @@ -6,17 +6,16 @@ @buf = dso_local global [1024 x i8] zeroinitializer, align 16 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 -; Function Attrs: nounwind uwtable -define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr { +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind { ; AVX512-LABEL: test_api: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: .cfi_def_cfa_offset 16 -; AVX512-NEXT: .cfi_offset %rbp, -16 ; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: .cfi_def_cfa_register %rbp ; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800 +; AVX512-NEXT: subq $8192, %rsp # imm = 0x2000 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %dx, %ax ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: movw %si, %ax @@ -36,34 +35,29 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX512-NEXT: movl $buf, %r9d ; AVX512-NEXT: movl $32, %r10d ; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl $64, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX512-NEXT: movl $buf, %r8d +; AVX512-NEXT: movl $32, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: movw $8, %si ; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $buf, %esi ; AVX512-NEXT: movl $32, %edi ; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -75,34 +69,29 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX512-NEXT: movl $buf2, %r9d ; AVX512-NEXT: movl $32, %r10d ; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movl $64, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX512-NEXT: movl $buf2, %r8d +; AVX512-NEXT: movl $32, %r9d +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: movw $8, %si ; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $buf2, %esi ; AVX512-NEXT: movl $32, %edi ; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -112,36 +101,39 @@ ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movl $64, %r10d +; AVX512-NEXT: movw $8, %di +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: tileloadd (%r8,%r10), %tmm0 +; AVX512-NEXT: movabsq $64, %r8 +; AVX512-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill +; AVX512-NEXT: movl $64, %r10d +; AVX512-NEXT: movw $8, %r8w +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %cx, 
{{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: movw $8, %di -; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1 -; AVX512-NEXT: tileloadd (%r9,%rsi), %tmm2 -; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0 +; AVX512-NEXT: tileloadd (%r9,%r10), %tmm2 +; AVX512-NEXT: movl $64, %r8d +; AVX512-NEXT: tileloadd (%rsi,%r8), %tmm0 +; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movabsq $64, %r8 +; AVX512-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload ; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) ; AVX512-NEXT: movl $64, %esi ; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movl $buf, %edx @@ -149,7 +141,6 @@ ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp -; AVX512-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512-NEXT: tilerelease ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -157,12 +148,13 @@ ; AVX2-LABEL: test_api: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: .cfi_offset %rbp, -16 ; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: .cfi_def_cfa_register %rbp ; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800 +; AVX2-NEXT: subq $8192, %rsp # imm = 0x2000 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %dx, %ax ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX2-NEXT: movw %si, %ax @@ -182,37 +174,29 @@ ; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX2-NEXT: movl $buf, %r9d ; AVX2-NEXT: movl $32, %r10d ; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl $64, %r9d +; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX2-NEXT: movl $buf, %r8d +; AVX2-NEXT: movl $32, %r9d +; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) 
-; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX2-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX2-NEXT: movl $64, %r8d +; AVX2-NEXT: movw $8, %si ; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) ; AVX2-NEXT: movl $buf, %esi ; AVX2-NEXT: movl $32, %edi ; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -224,37 +208,29 @@ ; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; AVX2-NEXT: movl $buf2, %r9d ; AVX2-NEXT: movl $32, %r10d ; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl $64, %r9d +; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: tilestored %tmm0, (%r8,%r9) +; AVX2-NEXT: movl $buf2, %r8d +; AVX2-NEXT: movl $32, %r9d +; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 +; AVX2-NEXT: tileloadd (%r8,%r9), %tmm0 +; AVX2-NEXT: movl $64, %r8d +; AVX2-NEXT: movw $8, %si ; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) ; AVX2-NEXT: movl $buf2, %esi ; AVX2-NEXT: movl $32, %edi ; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -264,38 +240,39 @@ ; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload ; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movl 
$64, %r10d +; AVX2-NEXT: movw $8, %di +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-NEXT: tileloadd (%r8,%r10), %tmm0 +; AVX2-NEXT: movabsq $64, %r8 +; AVX2-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill +; AVX2-NEXT: movl $64, %r10d +; AVX2-NEXT: movw $8, %r8w +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) ; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl $64, %esi -; AVX2-NEXT: movw $8, %di -; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1 -; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2 -; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0 +; AVX2-NEXT: tileloadd (%r9,%r10), %tmm2 +; AVX2-NEXT: movl $64, %r8d +; AVX2-NEXT: tileloadd (%rsi,%r8), %tmm0 +; AVX2-NEXT: movw $8, %si +; AVX2-NEXT: movabsq $64, %r8 +; AVX2-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload ; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; AVX2-NEXT: movl $64, %esi ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) ; AVX2-NEXT: movl $64, %esi ; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-NEXT: movl $buf, %edx @@ -303,7 +280,6 @@ ; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: tilerelease ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -311,12 +287,15 @@ ; SSE2-LABEL: test_api: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: .cfi_def_cfa_offset 16 -; SSE2-NEXT: .cfi_offset %rbp, -16 ; SSE2-NEXT: movq %rsp, %rbp -; SSE2-NEXT: .cfi_def_cfa_register %rbp ; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800 +; SSE2-NEXT: subq $8192, %rsp # imm = 0x2000 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %dx, %ax ; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; SSE2-NEXT: movw %si, %ax @@ -336,44 +315,29 @@ ; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, 
{{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; SSE2-NEXT: movl $buf, %r9d ; SSE2-NEXT: movl $32, %r10d ; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: tilestored %tmm0, (%r11,%r8) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movl $64, %r9d +; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: tilestored %tmm0, (%r8,%r9) +; SSE2-NEXT: movl $buf, %r8d +; SSE2-NEXT: movl $32, %r9d +; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 +; SSE2-NEXT: tileloadd (%r8,%r9), %tmm0 +; SSE2-NEXT: movl $64, %r8d +; SSE2-NEXT: movw $8, %si ; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) ; SSE2-NEXT: movl $buf, %esi ; SSE2-NEXT: movl $32, %edi ; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -385,44 +349,29 @@ ; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; SSE2-NEXT: movl $buf2, %r9d ; SSE2-NEXT: movl $32, %r10d ; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: tilestored %tmm0, (%r11,%r8) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movl $64, %r9d +; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: tilestored %tmm0, (%r8,%r9) +; SSE2-NEXT: movl $buf2, %r8d +; SSE2-NEXT: movl $32, %r9d +; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, 
{{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 +; SSE2-NEXT: tileloadd (%r8,%r9), %tmm0 +; SSE2-NEXT: movl $64, %r8d +; SSE2-NEXT: movw $8, %si ; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) ; SSE2-NEXT: movl $buf2, %esi ; SSE2-NEXT: movl $32, %edi ; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0 @@ -432,42 +381,39 @@ ; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload ; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE2-NEXT: movl $64, %r10d +; SSE2-NEXT: movw $8, %di +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-NEXT: tileloadd (%r8,%r10), %tmm0 +; SSE2-NEXT: movabsq $64, %r8 +; SSE2-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill +; SSE2-NEXT: movl $64, %r10d +; SSE2-NEXT: movw $8, %r8w +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) ; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: movw $8, %di -; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1 -; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2 -; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0 +; SSE2-NEXT: tileloadd (%r9,%r10), %tmm2 +; SSE2-NEXT: movl $64, %r8d +; SSE2-NEXT: tileloadd (%rsi,%r8), %tmm0 +; SSE2-NEXT: movw $8, %si +; SSE2-NEXT: movabsq $64, %r8 +; SSE2-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload ; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; SSE2-NEXT: movl $64, %esi ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) ; 
SSE2-NEXT: movl $64, %esi ; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-NEXT: movl $buf, %edx @@ -475,7 +421,6 @@ ; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) ; SSE2-NEXT: movq %rbp, %rsp ; SSE2-NEXT: popq %rbp -; SSE2-NEXT: .cfi_def_cfa %rsp, 8 ; SSE2-NEXT: tilerelease ; SSE2-NEXT: retq entry: @@ -503,11 +448,6 @@ ret void } -; Function Attrs: nounwind declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) - -; Function Attrs: nounwind declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) - -; Function Attrs: nounwind declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir @@ -0,0 +1,154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass=fastpretileconfig -o - %s | FileCheck %s + +# Test spill/reload across basic block. + +--- +name: foo +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: tile } + - { id: 3, class: gr64_nosp } + - { id: 4, class: gr64 } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: gr32 } + - { id: 9, class: vr512 } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 1024, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8 + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri1]], [[MOV16ri]] + ; CHECK-NEXT: [[MOV64ri:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, killed [[MOV64ri]], 0, $noreg, [[PTILEZEROV]] :: (store (s8192) into %stack.3) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri1:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, killed [[MOV64ri1]], 0, $noreg, [[PTILELOADDV]] :: (store (s8192) into %stack.2) + ; CHECK-NEXT: %row:gr16 = MOV16ri 32 + ; CHECK-NEXT: %col:gr16 = MOV16ri 8 + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = 
PTILELOADDV %row, %col, [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri2:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.2, 1, killed [[MOV64ri2]], 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: [[MOV64ri3:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV3:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.3, 1, killed [[MOV64ri3]], 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV1]], killed [[PTILELOADDV3]], killed [[PTILELOADDV2]] + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]] + ; CHECK-NEXT: RET 0, killed $eax + bb.0.entry: + %0:gr16 = MOV16ri 32 + %1:gr16 = MOV16ri 8 + %2:tile = PTILEZEROV %1, %0 + %3:gr64_nosp = MOV32ri64 32 + %4:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + %5:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + %row:gr16 = MOV16ri 32 + %col:gr16 = MOV16ri 8 + JMP_1 %bb.1 + bb.1: + %6:tile = PTILELOADDV %row, %col, %4, 1, %3, 0, $noreg + %7:tile = PTDPBSSDV %1, %0, %0, killed %6, killed %2, killed %5 + PTILESTOREDV killed %1, killed %0, killed %4, 1, killed %3, 0, $noreg, killed %7 + %8:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY killed %8 + RET 0, killed $eax + +... + +# Test tile copy fold +--- +name: copy +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: tile } + - { id: 3, class: gr64_nosp } + - { id: 4, class: gr64 } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: gr32 } + - { id: 9, class: vr512 } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 1024, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: copy + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8 + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri1]], [[MOV16ri]] + ; CHECK-NEXT: [[MOV64ri:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, killed [[MOV64ri]], 0, $noreg, [[PTILEZEROV]] :: (store (s8192) into %stack.3) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri1:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, killed [[MOV64ri1]], 0, $noreg, 
[[PTILELOADDV]] :: (store (s8192) into %stack.2) + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.4, align 4) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[MOV64ri2:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %t:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.2, 1, killed [[MOV64ri2]], 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: [[MOV64ri3:%[0-9]+]]:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], %stack.3, 1, killed [[MOV64ri3]], 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]], killed %t + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]] + ; CHECK-NEXT: RET 0, killed $eax + bb.0.entry: + %0:gr16 = MOV16ri 32 + %1:gr16 = MOV16ri 8 + %2:tile = PTILEZEROV %1, %0 + %3:gr64_nosp = MOV32ri64 32 + %4:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + %5:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + JMP_1 %bb.1 + bb.1: + %6:tile = PTILELOADDV %1, %0, %4, 1, %3, 0, $noreg + %t:tile = COPY %5 + %7:tile = PTDPBSSDV %1, %0, %0, killed %6, killed %2, killed %t + PTILESTOREDV killed %1, killed %0, killed %4, 1, killed %3, 0, $noreg, killed %7 + %8:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY killed %8 + RET 0, killed $eax + +... 
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir @@ -0,0 +1,146 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=fastpretileconfig -o - %s | FileCheck %s + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-unknown" + + @buf = dso_local global [1024 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + + define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 { + entry: + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + + if.then: ; preds = %entry + %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + br label %if.end + + if.else: ; preds = %entry + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + br label %if.end + + if.end: ; preds = %if.else, %if.then + %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ] + %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ] + %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ] + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6) + ret void + } + + declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1 + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1 + declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1 + + attributes #0 = { "target-features"="+amx-int8,+avx512f" } + attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" } + +... 
+--- +name: test_api +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: tile } + - { id: 1, class: tile } + - { id: 2, class: tile } + - { id: 3, class: tile } + - { id: 4, class: tile } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: tile } + - { id: 9, class: gr32 } + - { id: 10, class: gr32 } + - { id: 11, class: gr32 } + - { id: 12, class: gr16 } + - { id: 13, class: gr16 } + - { id: 14, class: gr64 } + - { id: 15, class: gr64_nosp } + - { id: 16, class: gr16 } + - { id: 17, class: gr64 } + - { id: 18, class: gr64_nosp } + - { id: 19, class: gr16 } + - { id: 20, class: gr16 } + - { id: 21, class: tile } + - { id: 22, class: gr64 } + - { id: 23, class: gr64_nosp } +liveins: + - { reg: '$edi', virtual-reg: '%9' } + - { reg: '$esi', virtual-reg: '%10' } + - { reg: '$edx', virtual-reg: '%11' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $edi, $esi, $edx + + ; CHECK: {{%.*}}:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.3, 1, $noreg, 0, $noreg, {{%.*}} + + %11:gr32 = COPY killed $edx + %10:gr32 = COPY killed $esi + %9:gr32 = COPY killed $edi + %13:gr16 = COPY killed %11.sub_16bit + %12:gr16 = COPY killed %10.sub_16bit + TEST32rr killed %9, %9, implicit-def $eflags + JCC_1 %bb.2, 4, implicit killed $eflags + JMP_1 %bb.1 + + bb.1.if.then: + %14:gr64 = MOV32ri64 @buf + %15:gr64_nosp = MOV32ri64 32 + %16:gr16 = MOV16ri 8 + ; CHECK: LDTILECFG + %0:tile = PTILELOADDV %12, %16, %14, 1, %15, 0, $noreg + %1:tile = PTILELOADDV killed %16, %13, %14, 1, %15, 0, $noreg + %2:tile = PTILELOADDV %12, %13, killed %14, 1, killed %15, 0, $noreg + JMP_1 %bb.3 + + bb.2.if.else: + %17:gr64 = MOV32ri64 @buf2 + %18:gr64_nosp = MOV32ri64 32 + %19:gr16 = MOV16ri 8 + ; CHECK: LDTILECFG + %3:tile = PTILELOADDV %12, %19, %17, 1, %18, 0, $noreg + %4:tile = PTILELOADDV killed %19, %13, %17, 1, %18, 0, $noreg + %5:tile = PTILELOADDV %12, %13, killed %17, 1, killed %18, 0, $noreg + + bb.3.if.end: + + ; CHECK: bb.3.if.end + ; CHECK-NEXT: %44:gr16 = PHI %16, %bb.1, %19, %bb.2 + ; CHECK-NEXT: %43:gr16 = PHI %12, %bb.1, %12, %bb.2 + ; CHECK-NEXT: %42:gr64_nosp = PHI %45, %bb.1, %46, %bb.2 + ; CHECK-NEXT: %38:gr16 = PHI %13, %bb.1, %13, %bb.2 + ; CHECK-NEXT: %37:gr16 = PHI %16, %bb.1, %19, %bb.2 + ; CHECK-NEXT: %36:gr64_nosp = PHI %39, %bb.1, %40, %bb.2 + ; CHECK-NEXT: %32:gr16 = PHI %13, %bb.1, %13, %bb.2 + ; CHECK-NEXT: %31:gr16 = PHI %12, %bb.1, %12, %bb.2 + ; CHECK-NEXT: %30:gr64_nosp = PHI %33, %bb.1, %34, %bb.2 + ; CHECK-NEXT: LDTILECFG + ; CHECK-NEXT: %47:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %6:tile = PTILELOADDV %43, %44, %42, 1, killed %47, 0, $noreg + ; CHECK-NEXT: %41:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %7:tile = PTILELOADDV %37, %38, %36, 1, killed %41, 0, $noreg + ; CHECK-NEXT: %35:gr64_nosp = MOV64ri 64 + ; CHECK-NEXT: %8:tile = PTILELOADDV %31, %32, %30, 1, killed %35, 0, $noreg + + %6:tile = PHI %0, %bb.1, %3, %bb.2 + %7:tile = PHI %1, %bb.1, %4, %bb.2 + %8:tile = PHI %2, %bb.1, %5, %bb.2 + %20:gr16 = MOV16ri 8 + %21:tile = PTDPBSSDV %12, %13, killed %20, killed %8, killed %6, killed %7 + %22:gr64 = MOV32ri64 @buf + %23:gr64_nosp = MOV32ri64 32 + PTILESTOREDV killed %12, killed %13, killed %22, 1, killed %23, 0, $noreg, killed %21 + RET 0 + +... 
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir @@ -0,0 +1,61 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass=fastpretileconfig -o - %s | FileCheck %s + +# Test the case where TILELOADD is mixed with pseudo AMX instructions. +... +--- +name: main +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr64_nosp } + - { id: 1, class: gr64 } + - { id: 2, class: gr16 } + - { id: 3, class: gr16 } + - { id: 4, class: tile } + - { id: 5, class: tile } + - { id: 6, class: tile } + - { id: 7, class: tile } + - { id: 8, class: gr32 } + - { id: 9, class: vr512 } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 1024, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: {} +body: | + bb.0.entry: + ; CHECK-LABEL: name: main + ; CHECK: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.2, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.2, align 4) + ; CHECK-NEXT: MOV8mi %stack.2, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.2, align 4) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 32 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 8 + ; CHECK-NEXT: LDTILECFG %stack.2, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load store (s512) on %stack.2, align 4) + ; CHECK-NEXT: $tmm0 = TILELOADD [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri1]], [[MOV16ri]], [[LEA64r]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri1]], [[MOV16ri]], [[MOV16ri]], killed [[PTILELOADDV2]], killed [[PTILELOADDV]], killed [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri1]], killed [[MOV16ri]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: $eax = COPY killed [[MOV32r0_]] + ; CHECK-NEXT: RET 0, killed $eax + %0:gr64_nosp = MOV32ri64 32 + %1:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + %2:gr16 = MOV16ri 32 + %3:gr16 = MOV16ri 8 + $tmm0 = TILELOADD %1, 1, %0, 0, $noreg + %4:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg + %5:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg + %6:tile = PTILELOADDV %3, %2, %1, 1, %0, 0, $noreg + %7:tile = PTDPBSSDV %3, %2, %2, killed %6, killed %4, killed %5 + PTILESTOREDV killed %3, killed %2, killed %1, 1, killed %0, 0, $noreg, killed %7 + %8:gr32 = MOV32r0 implicit-def dead $eflags + $eax = COPY killed %8 + RET 0, killed $eax + +... 
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -20,7 +20,6 @@ ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store -; CHECK-NEXT: Pre AMX Tile Config ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -42,6 +41,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 DynAlloca Expander +; CHECK-NEXT: Fast Tile Register Preconfigure ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator