Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -501,6 +501,9 @@ /// or split the data to two <128 x i32>. FunctionPass *createX86LowerAMXTypePass(); + /// The pass insert tile config intrinsics for AMX fast register allocation. + FunctionPass *createX86PreAMXConfigPass(); + /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. FunctionPass *createX86LowerAMXIntrinsicsPass(); Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1316,6 +1316,9 @@ report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addPass(createRegAllocPass(false)); + + // Allow targets to change the register assignments before rewriting. + addPreRewrite(); return true; } Index: llvm/lib/Target/X86/CMakeLists.txt =================================================================== --- llvm/lib/Target/X86/CMakeLists.txt +++ llvm/lib/Target/X86/CMakeLists.txt @@ -34,8 +34,10 @@ X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp + X86PreAMXConfig.cpp X86LowerAMXIntrinsics.cpp X86TileConfig.cpp + X86FastTileConfig.cpp X86PreTileConfig.cpp X86ExpandPseudo.cpp X86FastISel.cpp Index: llvm/lib/Target/X86/X86.h =================================================================== --- llvm/lib/Target/X86/X86.h +++ llvm/lib/Target/X86/X86.h @@ -79,6 +79,9 @@ /// Return a pass that config the tile registers. FunctionPass *createX86TileConfigPass(); +/// Return a pass that config the tile registers after fast reg allocation. +FunctionPass *createX86FastTileConfigPass(); + /// Return a pass that insert pseudo tile config instruction. FunctionPass *createX86PreTileConfigPass(); @@ -172,8 +175,10 @@ void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86FastTileConfigPass(PassRegistry &); void initializeX86TileConfigPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); +void initializeX86PreAMXConfigPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); Index: llvm/lib/Target/X86/X86FastTileConfig.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -0,0 +1,304 @@ +//===-- X86FastTileConfig.cpp - Fast Tile Register Configure---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to config the shape of AMX physical registers +/// AMX register need to be configured before use. Before FastRegAllocation pass +/// the ldtilecfg instruction is inserted, however at that time we don't +/// know the shape of each physical tile registers, because the register +/// allocation is not done yet. This pass runs after egister allocation +/// pass. 
It collects the shape information of each physical tile register +/// and store the shape in the stack slot that is allocated for load config +/// to tile config register. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "fasttileconfig" + +namespace { + +class X86FastTileConfig : public MachineFunctionPass { + // context + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI = nullptr; + + MachineInstr *getTileConfigPoint(); + void tileConfig(); + +public: + X86FastTileConfig() : MachineFunctionPass(ID) {} + + void fastTileConfig(); + bool isTileLoad(MachineInstr &MI); + bool isTileStore(MachineInstr &MI); + bool isAMXInstr(MachineInstr &MI); + void getTileStoreShape(MachineInstr &MI, + SmallVector &ShapedTiles); + + MachineInstr *getKeyAMXInstr(MachineInstr *MI); + unsigned getTileShapesCfg(MachineInstr *MI, + SmallVector &ShapedTiles); + void getShapeCfgInstrs(MachineInstr *MI, + std::map &RowCfgs, + std::map &ColCfgs); + + /// Return the pass name. + StringRef getPassName() const override { + return "Fast Tile Register Configure"; + } + + void materializeTileCfg(MachineInstr *MI); + + void rewriteTileCfg(SmallVector &ShapedTiles, + std::map &RowCfgs, + std::map &ColCfgs); + + /// Perform register allocation. + bool runOnMachineFunction(MachineFunction &MFunc) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + static char ID; +}; + +} // end anonymous namespace + +char X86FastTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, + "Fast Tile Register Configure", false, false) +INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, + "Fast Tile Register Configure", false, false) + +static bool isTilePhysReg(MachineOperand &Op) { + if (!Op.isReg()) + return false; + + Register Reg = Op.getReg(); + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return true; + return false; +} + +static unsigned getTilePhysRegIdx(MachineOperand *Op) { + assert(isTilePhysReg(*Op) && "Tile Operand is invalid"); + return Op->getReg() - X86::TMM0; +} + +static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) { + unsigned Offset = 48 + TIdx; + MI->getOperand(3).ChangeToImmediate(Offset); +} + +static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) { + unsigned Offset = 16 + TIdx * 2; + MI->getOperand(3).ChangeToImmediate(Offset); +} + +bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { + return MI.getOpcode() == X86::PTILELOADDV; +} +bool X86FastTileConfig::isTileStore(MachineInstr &MI) { + return MI.getOpcode() == X86::PTILESTOREDV; +} +bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { + // TODO: May need to handle some special nontile amx instrucion. 
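+  // Everything else is treated as an AMX instruction if it has at least one
+  // tile physical register (TMM0-TMM7) operand; LDTILECFG is skipped because
+  // it only loads the configuration itself.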
+ if (MI.getOpcode() == X86::LDTILECFG) + return false; + + for (MachineOperand &MO : MI.operands()) + if (isTilePhysReg(MO)) + return true; + + return false; +} + +MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { + auto Cfg = MachineBasicBlock::iterator(MI); + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *KeyMI = nullptr; + int KeyAMXNum = 0; + + for (auto II = Cfg; II != MBB->end(); II++) { + if (isTileLoad(*II)) { + KeyMI = &*II; + continue; + } + if (isTileStore(*II)) { + if (!KeyMI) + KeyMI = &*II; + else + break; + } + if (isAMXInstr(*II)) { + assert((KeyAMXNum == 0) && "Too many Key AMX instruction!"); + KeyAMXNum++; + KeyMI = &*II; + } + } + assert(KeyMI && "There must be an AMX instruction."); + return KeyMI; +} + +// Orderly get the tiles in key amx instruction, uses before defs. +unsigned X86FastTileConfig::getTileShapesCfg( + MachineInstr *CfgMI, SmallVector &ShapedTiles) { + MachineInstr *KeyMI = getKeyAMXInstr(CfgMI); + + SmallVector DefTiles; + for (MachineOperand &MO : KeyMI->operands()) { + if (!isTilePhysReg(MO)) + continue; + if (MO.isDef()) + DefTiles.push_back(&MO); + else + ShapedTiles.push_back(&MO); + } + ShapedTiles.append(DefTiles); + + return ShapedTiles.size(); +} + +// We pre-config the shapes at position named with "amx.tmm.N.shape.row* and +// amx.shape.N.col*" at pass "Pre AMX Tile Config". +// The 'N' implies the order of tiles in key amx intrinsic. +void X86FastTileConfig::getShapeCfgInstrs( + MachineInstr *MI, std::map &RowCfgs, + std::map &ColCfgs) { + auto Cfg = MachineBasicBlock::iterator(MI); + MachineBasicBlock *MBB = MI->getParent(); + + for (auto II = Cfg; II != MBB->begin(); II--) { + if (isAMXInstr(*II) || II->isTerminator() || II->isCall()) + break; + if (!II->mayStore() || !II->hasOneMemOperand()) + continue; + const Value *MemPtr = II->memoperands()[0]->getValue(); + if (!MemPtr) + continue; + + StringRef Name = MemPtr->getName(); + if (!Name.startswith("amx.tmm.")) + continue; + + // Get the 'N'th tile shape config in key amx instruction. + auto N = Name.find(".shape"); + StringRef STileIdx = Name.slice(8, N); + unsigned Idx; + STileIdx.getAsInteger(10, Idx); + + // And related them with their store instructions. + if (Name.contains("row")) + RowCfgs[Idx] = &*II; + else if (Name.contains("col")) + ColCfgs[Idx] = &*II; + else + llvm_unreachable("Invalid tile shape info!"); + } + assert((RowCfgs.size() == ColCfgs.size()) && + "The number of tile row and col must be equal!"); +} + +// Here is the data format for the tile config. +// 0 palette = 0 now. +// 1 start_row = 0 now. +// 2-15 reserved, must be zero +// 16-17 tile0.colsb Tile 0 bytes per row. +// 18-19 tile1.colsb Tile 1 bytes per row. +// 20-21 tile2.colsb Tile 2 bytes per row. +// ... (sequence continues) +// 30-31 tile7.colsb Tile 7 bytes per row. +// 32-47 reserved, must be zero +// 48 tile0.rows Tile 0 rows. +// 49 tile1.rows Tile 1 rows. +// 50 tile2.rows Tile 2 rows. +// ... (sequence continues) +// 55 tile7.rows Tile 7 rows. +// 56-63 reserved, must be zero +void X86FastTileConfig::rewriteTileCfg( + SmallVector &ShapedTiles, + std::map &RowCfgs, + std::map &ColCfgs) { + assert((RowCfgs.size() == ShapedTiles.size()) && + "The number of tile shapes not equal with the number of tiles!"); + + // Orderly get the tiles and adjust the shape config. 
+ for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) { + MachineOperand *MO = ShapedTiles[I]; + unsigned TmmIdx = getTilePhysRegIdx(MO); + if (I == TmmIdx) + continue; + adjustRowCfg(TmmIdx, RowCfgs[I]); + adjustColCfg(TmmIdx, ColCfgs[I]); + } +} + +// We have already preconfig the shapes before fast register allocation at +// X86PreAMXConfig::preWriteTileCfg(). Now, we have done fast register +// allocation, the shapes pre-written before may not rightly corresponding +// to the correct tmm registers, so we need adjust them. +void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) { + SmallVector ShapedTiles; + std::map RowCfgs; + std::map ColCfgs; + + // Orderly keep the tile uses and def in ShapedTiles; + unsigned NumCfg = getTileShapesCfg(CfgMI, ShapedTiles); + assert(NumCfg && "Not find shapes config!"); + + getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs); + + rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs); +} + +void X86FastTileConfig::fastTileConfig() { + for (MachineBasicBlock &MBB : *MF) { + SmallVector CFGs; + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == X86::LDTILECFG) + CFGs.push_back(&MI); + for (auto *MI : CFGs) + materializeTileCfg(MI); + } +} + +bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) { + MF = &MFunc; + MRI = &MFunc.getRegInfo(); + ST = &MFunc.getSubtarget(); + TRI = ST->getRegisterInfo(); + TII = MFunc.getSubtarget().getInstrInfo(); + + fastTileConfig(); + return true; +} + +FunctionPass *llvm::createX86FastTileConfigPass() { + return new X86FastTileConfig(); +} Index: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp =================================================================== --- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -52,6 +53,10 @@ } #endif +static cl::opt + X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), + cl::Hidden, cl::desc("X86: enable AMX scalarizition.")); + namespace { class X86LowerAMXIntrinsics { Function &Func; @@ -87,6 +92,7 @@ lowerTileDP(Instruction *TileDP); bool lowerTileZero(Instruction *TileZero); }; +} // anonymous namespace BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader, BasicBlock *Exit, Value *Bound, @@ -564,9 +570,6 @@ return C; } -} // anonymous namespace - -namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -578,6 +581,8 @@ } bool runOnFunction(Function &F) override { + if (!X86ScalarizeAMX) + return false; TargetMachine *TM = &getAnalysis().getTM(); if (!F.hasFnAttribute(Attribute::OptimizeNone) && TM->getOptLevel() != CodeGenOpt::None) @@ -601,8 +606,6 @@ } }; -} // anonymous namespace - static const char PassName[] = "Lower AMX intrinsics"; char X86LowerAMXIntrinsicsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, Index: llvm/lib/Target/X86/X86LowerAMXType.cpp =================================================================== --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===// +//===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License 
v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,6 +14,27 @@ /// load/store <256 x i32> instruction to AMX load/store. If the bitcast can /// not be combined with load/store, we transform the bitcast to amx load/store /// and <256 x i32> store/load. +/// +/// If Front End not use O0 but the Mid/Back end use O0, (e.g. "Clang -O2 -S +/// -emit-llvm t.c" + "llc t.ll") we should make sure the amx data is volatile, +/// because that is nessary for AMX fast register allocation. (In Fast register +/// allocation, register will be allocated before spill/reload, so there is no +/// additional register for amx to identify the step in spill/reload.) +/// The volatileTileData() will handle this case. +/// e.g. +/// ---------------------------------------------------------- +/// | def %td = ... | +/// | ... | +/// | "use %td" | +/// ---------------------------------------------------------- +/// will transfer to --> +/// ---------------------------------------------------------- +/// | def %td = ... | +/// | call void @llvm.x86.tilestored64.internal(mem, %td) | +/// | ... | +/// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)| +/// | "use %td2" | +/// ---------------------------------------------------------- // //===----------------------------------------------------------------------===// // @@ -41,7 +62,8 @@ #define DEBUG_TYPE "lower-amx-type" -static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) { +static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, + BasicBlock *BB) { Function &F = *BB->getParent(); Module *M = BB->getModule(); const DataLayout &DL = M->getDataLayout(); @@ -164,7 +186,7 @@ auto *Src = Bitcast->getOperand(0); auto Prepare = [&]() { - AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent()); + AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent()); I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy()); Stride = Builder.getInt64(64); }; @@ -322,6 +344,259 @@ } } // anonymous namespace +static Value *getAllocaPos(BasicBlock *BB) { + Module *M = BB->getModule(); + Function *F = BB->getParent(); + IRBuilder<> Builder(&F->getEntryBlock().front()); + const DataLayout &DL = M->getDataLayout(); + unsigned AllocaAS = DL.getAllocaAddrSpace(); + Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false); + AllocaInst *AllocaRes = + new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front()); + BasicBlock::iterator Iter = AllocaRes->getIterator(); + ++Iter; + Builder.SetInsertPoint(&*Iter); + Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getInt8PtrTy()); + return I8Ptr; +} + +static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { + assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); + auto *II = cast(TileDef); + assert(II && "Not tile intrinsic!"); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + + BasicBlock *BB = TileDef->getParent(); + BasicBlock::iterator Iter = TileDef->getIterator(); + IRBuilder<> Builder(BB, ++Iter); + Value *Stride = Builder.getInt64(64); + std::array Args = {Row, Col, Ptr, Stride, TileDef}; + + Instruction *TileStore = + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); + return TileStore; +} + +static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { + Value *V = U.get(); + assert(V->getType()->isX86_AMXTy() && "Not define tile!"); + + // Get tile shape. 
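+  // The (row, col) shape is read from the AMX intrinsic that defines V; for a
+  // PHI the first incoming value is used, since all incoming tiles are
+  // assumed to share one shape.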
+ IntrinsicInst *II = nullptr; + if (IsPHI) { + Value *PhiOp = dyn_cast(V)->getIncomingValue(0); + II = cast(PhiOp); + } else + II = cast(V); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + + Instruction *UserI = dyn_cast(U.getUser()); + IRBuilder<> Builder(UserI); + Value *Stride = Builder.getInt64(64); + std::array Args = {Row, Col, Ptr, Stride}; + + Value *TileLoad = + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args); + UserI->replaceUsesOfWith(V, TileLoad); +} + +static bool isIncomingOfPHI(Instruction *I) { + for (Use &U : I->uses()) { + User *V = U.getUser(); + if (isa(V)) + return true; + } + return false; +} + +// Let all AMX tile data become volatile data, shorten the life range +// of each tile register before fast register allocation. +namespace { +class X86VolatileTileData { + Function &F; + +public: + X86VolatileTileData(Function &Func) : F(Func) {} + Value *updatePhiIncomings(BasicBlock *BB, + SmallVector &Imcomings); + void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr); + bool volatileTileData(); + void volatileTilePHI(PHINode *Inst); + void volatileTileNonPHI(Instruction *I); +}; + +Value *X86VolatileTileData::updatePhiIncomings( + BasicBlock *BB, SmallVector &Imcomings) { + Value *I8Ptr = getAllocaPos(BB); + + for (auto *I : Imcomings) { + User *Store = createTileStore(I, I8Ptr); + + // All its uses (except phi) should load from stored mem. + for (Use &U : I->uses()) { + User *V = U.getUser(); + if (isa(V) || V == Store) + continue; + replaceWithTileLoad(U, I8Ptr); + } + } + return I8Ptr; +} + +void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI, + Value *StorePtr) { + for (Use &U : PHI->uses()) + replaceWithTileLoad(U, StorePtr, true); + PHI->eraseFromParent(); +} + +// Smilar with volatileTileNonPHI, this function only handle PHI Nodes +// and their related AMX intrinsics. +// 1) PHI Def should change to tileload. +// 2) PHI Incoming Values should tilestored in just after their def. +// 3) The mem of these tileload and tilestores should be same. +// e.g. +// ------------------------------------------------------ +// bb_dom: +// ... +// br i1 %bool.cond, label %if.else, label %if.then +// +// if.then: +// def %t0 = ... +// ... +// use %t0 +// ... +// br label %if.end +// +// if.else: +// def %t1 = ... +// br label %if.end +// +// if.end: +// %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ] +// ... +// use %td +// ------------------------------------------------------ +// --> +// ------------------------------------------------------ +// bb_entry: +// %mem = alloca <256 x i32>, align 1024 * +// ... +// bb_dom: +// ... +// br i1 %bool.cond, label %if.else, label %if.then +// +// if.then: +// def %t0 = ... +// call void @llvm.x86.tilestored64.internal(mem, %t0) * +// ... +// %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)* +// use %t0` * +// ... +// br label %if.end +// +// if.else: +// def %t1 = ... +// call void @llvm.x86.tilestored64.internal(mem, %t1) * +// br label %if.end +// +// if.end: +// ... 
+// %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) * +// use %td +// ------------------------------------------------------ +void X86VolatileTileData::volatileTilePHI(PHINode *PHI) { + BasicBlock *BB = PHI->getParent(); + SmallVector Imcomings; + + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + Value *Op = PHI->getIncomingValue(i); + Instruction *I = dyn_cast(Op); + assert(I && "We shouldn't fold AMX instrution!"); + Imcomings.push_back(I); + } + + Value *StorePtr = updatePhiIncomings(BB, Imcomings); + replacePhiDefWithLoad(PHI, StorePtr); +} + +// Store the defined tile and load it before use. +// All its users are not PHI. +// e.g. +// ------------------------------------------------------ +// def %td = ... +// ... +// "use %td" +// ------------------------------------------------------ +// --> +// ------------------------------------------------------ +// def %td = ... +// call void @llvm.x86.tilestored64.internal(mem, %td) +// ... +// %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem) +// "use %td2" +// ------------------------------------------------------ +void X86VolatileTileData::volatileTileNonPHI(Instruction *I) { + BasicBlock *BB = I->getParent(); + Value *I8Ptr = getAllocaPos(BB); + User *Store = createTileStore(I, I8Ptr); + + // All its uses should load from stored mem. + for (Use &U : I->uses()) { + User *V = U.getUser(); + assert(!isa(V) && "PHI Nodes should be excluded!"); + if (V != Store) + replaceWithTileLoad(U, I8Ptr); + } +} + +// Volatile Tile Model: +// 1) All the uses of tile data comes from tileload in time. +// 2) All the defs of tile data tilestore into mem immediately. +// For example: +// -------------------------------------------------------------------------- +// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key +// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) +// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx +// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) +// call void @llvm.x86.tilestored64.internal(... td) area +// -------------------------------------------------------------------------- +// 3) No terminator, call or other amx instructions in the key amx area. +bool X86VolatileTileData::volatileTileData() { + bool Changed = false; + for (BasicBlock &BB : F) { + SmallVector PHIInsts; + SmallVector AMXDefInsts; + + for (Instruction &I : BB) { + if (!I.getType()->isX86_AMXTy()) + continue; + if (isa(&I)) + PHIInsts.push_back(&I); + else + AMXDefInsts.push_back(&I); + } + + // First we "volatile" the non-phi related amx intrinsics. + for (Instruction *I : AMXDefInsts) { + if (isIncomingOfPHI(I)) + continue; + volatileTileNonPHI(I); + Changed = true; + } + + for (Instruction *I : PHIInsts) { + volatileTilePHI(dyn_cast(I)); + Changed = true; + } + } + return Changed; +} + +} // anonymous namespace + namespace { class X86LowerAMXTypeLegacyPass : public FunctionPass { @@ -334,11 +609,27 @@ bool runOnFunction(Function &F) override { TargetMachine *TM = &getAnalysis().getTM(); - if (F.hasFnAttribute(Attribute::OptimizeNone) || - TM->getOptLevel() == CodeGenOpt::None) - return false; + // if (F.hasFnAttribute(Attribute::OptimizeNone) || + // TM->getOptLevel() == CodeGenOpt::None) + // return false; + X86LowerAMXType LAT(F); bool C = LAT.visit(); + + // Prepare for fast register allocation at O0. + // Todo: May better check the volatile model of AMX code, not just + // by checking Attribute::OptimizeNone and CodeGenOpt::None. 
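+    // Note LAT.visit() now runs even for optnone/O0 functions; the
+    // volatile-tile rewriting and the later AMX pre-configuration expect the
+    // x86_amx bitcasts to have been lowered first.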
+ if (TM->getOptLevel() == CodeGenOpt::None) { + // If Front End not use O0 but the Mid/Back end use O0, (e.g. + // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make + // sure the amx data is volatile, that is nessary for AMX fast + // register allocation. + if (!F.hasFnAttribute(Attribute::OptimizeNone)) { + X86VolatileTileData VTD(F); + C = VTD.volatileTileData() || C; + } + } + return C; } Index: llvm/lib/Target/X86/X86PreAMXConfig.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/X86/X86PreAMXConfig.cpp @@ -0,0 +1,409 @@ +//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Insert tilecfg for each area of key AMX intrinsic. +/// All the key AMX intrinsic's tile operand must come from tileload. And the +/// def tile of key AMX intrinsic must be tilestored. +/// take tdpbssd for example: +/// -------------------------------------------------------------------------- +/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...) key +/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...) | +/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...) amx +/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3) | +/// call void @llvm.x86.tilestored64.internal(... td) area +/// -------------------------------------------------------------------------- +/// This pass will insert tilecfg before every key-amx-area, some like: +/// -------------------------------------------------------------------------- +/// %cfgmem = alloca <16 x i32>, align 4 * allocate mem +/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init +/// ... +/// ... pre-config shape of %t1 * +/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * +/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config +/// ... * +/// ... pre-config shape of %t2 * shapes +/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * +/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * +/// ... 
+/// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * tile config +// +//===----------------------------------------------------------------------===// +// +#include "X86.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "pre-amx-config" + +static bool isAMXIntrinsic(IntrinsicInst *II) { + for (Value *Operand : II->operands()) + if (Operand->getType()->isX86_AMXTy()) + return true; + return II->getType()->isX86_AMXTy(); +} + +static bool isTileLoad(IntrinsicInst *II) { + return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal; +} + +static bool isTileStore(IntrinsicInst *II) { + return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal; +} + +static bool brokenVolatile(Instruction *I) { + // Todo: it is weak to identify a normal call here. + if ((isa(I) && !isa(I)) || I->isTerminator()) + return true; + return false; +} + +namespace { +class X86PreAMXConfig { + Function &F; + +public: + X86PreAMXConfig(Function &Func) : F(Func) {} + bool preTileConfig(); + bool addTileConfig(Instruction *FirstLoad, SmallVector &Shapes); + bool findConfigShapes( + DenseMap> &PosAndShapes); + bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector &Shapes); + bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos, + SmallVector &Shapes); + BasicBlock::iterator + getShapesAndConfigPosEnd(BasicBlock::iterator Iter, + SmallVector &Shapes); + bool checkVolatileModel(SmallSet &Loads, IntrinsicInst *Store, + IntrinsicInst *KeyAMX); +}; + +// Orderly write the shapes in tilecfg's mem. This maybe not right. +// Because the first shape may not corresponding to the first tmm register, +// so we need to handle at at X86FastTileConfig::materializeTileCfg() +// after register allocation. +// For example: +// -------------------------------------------------------------------------- +// zeroinitialize tilecfg's mem (of ldtilecfg) +// -------------------------------------------------------------------------- +// ... pre-config shape of %t1 * +// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48 * +// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 * +// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * +// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config +// ... * +// ... pre-config shape of %t2 * +// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49 * +// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 * +// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes +// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * +// ... * +// ... pre-config shape of %t3 * of +// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50 * +// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 * +// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * +// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * +// ... * tiles +// ... 
pre-config shape of %td * +// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51 * +// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 * +// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * +// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * +// -------------------------------------------------------------------------- +// call void @llvm.x86.ldtilecfg(i8* %mem) * tile config +// -------------------------------------------------------------------------- +// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key +// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) +// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx +// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) +// call void @llvm.x86.tilestored64.internal(... td) area +// -------------------------------------------------------------------------- +bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos, + SmallVector &Shapes) { + bool Write = false; + LLVMContext &Ctx = Pos->getParent()->getContext(); + Type *I8Ty = Type::getInt8Ty(Ctx); + Type *I16Ty = Type::getInt16Ty(Ctx); + + for (int i = 0, e = Shapes.size() / 2; i < e; i++) { + Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + i); + Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + i * 2); + const std::string ShapeName = "amx.tmm." + itostr(i); + Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset, + ShapeName + ".shape.row", Pos); + Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos); + ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0), + ShapeName + ".shape.col", Pos); + Value *Row = Shapes[i * 2]; + Value *Col = Shapes[i * 2 + 1]; + Row = new TruncInst(Row, I8Ty, "", Pos); + new StoreInst(Row, RowPos, "", Pos); + new StoreInst(Col, ColPos, "", Pos); + Write = true; + } + return Write; +} + +bool X86PreAMXConfig::addTileConfig(Instruction *FirstLoad, + SmallVector &Shapes) { + Module *M = F.getParent(); + IRBuilder<> Builder(FirstLoad); + const DataLayout &DL = M->getDataLayout(); + unsigned AddrSpace = DL.getAllocaAddrSpace(); + LLVMContext &Ctx = Builder.getContext(); + Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false); + Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx)); + + AllocaInst *Addr = + new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front()); + Addr->setAlignment(Alignment); + Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); + + std::array Args = {I8Ptr}; + Instruction *Cfg = + Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg, None, Args); + + Value *Val0 = Constant::getNullValue(V512Ty); + Instruction *Init0 = new StoreInst(Val0, Addr, Cfg); + assert(Init0 && "Not Zero initilizate the cfg mem!"); + + preWriteTileCfg(I8Ptr, Cfg, Shapes); + + return Cfg; +} + +// Todo: We may need to handle "more than one store" case in the future. +bool X86PreAMXConfig::checkVolatileModel(SmallSet &Loads, + IntrinsicInst *Store, + IntrinsicInst *KeyAMX) { + Value *ST = Store->getOperand(4); + + // Only has tileload and tilestore. + if (!KeyAMX) { + if (Loads.size() != 1) + return false; + return Loads.contains(ST); + } + + // All Loads should be operands of KeyAMX. + // All tile operands of KeyAMX should come from Loads. + for (Value *Op : KeyAMX->operands()) { + if (Op->getType()->isX86_AMXTy()) + if (!Loads.erase(Op)) + return false; + } + + // The def of KeyAMX should be stored. + // Todo: is it key amx can be no def? 
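+  // Every tile load must have been consumed as a tile operand of KeyAMX, and
+  // the value being stored must be the result of KeyAMX itself.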
+ return Loads.empty() && (ST == cast(KeyAMX)); +} + +bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX, + SmallVector &Shapes) { + for (unsigned i = 0; i < KeyAMX->getNumOperands(); i++) { + Value *Op = KeyAMX->getOperand(i); + if (!Op->getType()->isX86_AMXTy()) + continue; + IntrinsicInst *TileDef = dyn_cast(Op); + assert((TileDef && isTileLoad(TileDef)) && + "All KeyAMX's tile definiation should comes from TileLoad!"); + Shapes.push_back(TileDef->getOperand(0)); + Shapes.push_back(TileDef->getOperand(1)); + } + if (!isTileStore(KeyAMX)) { + Shapes.push_back(KeyAMX->getOperand(0)); + Shapes.push_back(KeyAMX->getOperand(1)); + } + return Shapes.size() != 0; +} + +// Collect the shapes and skip the area of current key amx intrinsic. +// +// For example: +// ... +// -------------------------------------------------------------------------- +// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) record (m,k) +// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) record (m,k) +// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) record (m,k) +// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) +// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,k) +// -------------------------------------------------------------------------- +BasicBlock::iterator +X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter, + SmallVector &Shapes) { + IntrinsicInst *KeyAMX = nullptr; + BasicBlock *BB = Iter->getParent(); + BasicBlock::iterator PosEnd = BB->end(); + SmallSet Loads; + + // See TileStore as "Config Position End" and check volatile model. + for (auto I = Iter, E = BB->end(); I != E; ++I) { + assert(!brokenVolatile(&*I) && "Not reach tile store!"); + IntrinsicInst *II = dyn_cast(&*I); + if (!II || !isAMXIntrinsic(II)) + continue; + + if (isTileLoad(II)) + Loads.insert(II); + else if (isTileStore(II)) { + bool Res = checkVolatileModel(Loads, II, KeyAMX); + assert(Res && "Not Volatile AMX Model!"); + PosEnd = I; + break; + } else { + assert(!KeyAMX && "Too many key amx intrinsic!"); + KeyAMX = II; + } + } + assert(PosEnd != BB->end() && "Not find TileStore!"); + + // See KeyAMX as TileStore if only TileLoad and TileStore. + if (!KeyAMX) + KeyAMX = dyn_cast(&*PosEnd); + + // Get Shapes in order. + assert(Shapes.empty() && "Shapes should be clean."); + getKeyAMXShapes(KeyAMX, Shapes); + + return PosEnd; +} + +// Record a key amx area's shapes with its position. +// Use the first tileload as its position. +// For example: +// ... +// -------------------------------------------------------------------------- +// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) <-- pos +// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) / +// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) shapes: +// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3) (m,k)(k,n) +// call void @llvm.x86.tilestored64.internal(m, n,... 
td) (m,n)(m,n) +// -------------------------------------------------------------------------- +bool X86PreAMXConfig::findConfigShapes( + DenseMap> &PosAndShapes) { + bool Find = false; + for (BasicBlock &BB : F) { + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { + IntrinsicInst *II = dyn_cast(&*I); + if (!II) + continue; + if (!isAMXIntrinsic(II)) + continue; + assert(II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal && + "Not volatile model for AMX at O0!"); + + I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]); + Find = true; + } + } + return Find; +} + +// Insert ldtilecfg and preconfig the shapes for each area of key AMX intrinsic. +// e.g. (key amx = tdpbssd) +// -------------------------------------------------------------------------- +// %cfgmem = alloca <16 x i32>, align 4 * allocate mem +// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem * zero init +// ... +// ... pre-config shape of %t1 * +// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1 * +// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2 * pre-config +// ... * +// ... pre-config shape of %t2 * +// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1 * shapes +// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2 * +// ... * +// ... pre-config shape of %t3 * of +// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1 * +// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2 * +// ... * tiles +// ... pre-config shape of %td * +// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1 * +// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2 * +// +// call void @llvm.x86.ldtilecfg(i8* %cfgmem) * pre-config +// -------------------------------------------------------------------------- +// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key +// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...) +// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx +// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3) +// call void @llvm.x86.tilestored64.internal(... td) area +// -------------------------------------------------------------------------- +bool X86PreAMXConfig::preTileConfig() { + DenseMap> PosAndShapes; + bool NeedCfg = findConfigShapes(PosAndShapes); + if (!NeedCfg) + return false; + for (auto &IPAndShapes : PosAndShapes) { + addTileConfig(IPAndShapes.first, IPAndShapes.second); + } + + return true; +} +} // anonymous namespace + +namespace { + +class X86PreAMXConfigPass : public FunctionPass { +public: + static char ID; + + X86PreAMXConfigPass() : FunctionPass(ID) { + initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + TargetMachine *TM = &getAnalysis().getTM(); + bool C = false; + + // Prepare for fast register allocation at O0. + if (TM->getOptLevel() == CodeGenOpt::None) { + + // We pre-config each key AMX intrinsic at O0. + // In theory, one tile config can cover several AMX intrinsics, but + // it is very diffcult to classify the tile shapes at O0. So here we + // let thing be easy, pre-config every key AMX intrinsic. 
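+      // This costs one ldtilecfg per key AMX area: a function with, say,
+      // three separate tdpbssd areas gets three configurations.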
+      X86PreAMXConfig PCFG(F);
+      C = PCFG.preTileConfig();
+    }
+
+    return C;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+  }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Pre AMX Tile Config";
+char X86PreAMXConfigPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
+
+FunctionPass *llvm::createX86PreAMXConfigPass() {
+  return new X86PreAMXConfigPass();
+}
Index: llvm/lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetMachine.cpp
+++ llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -64,6 +64,7 @@
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
   initializeX86LowerAMXTypeLegacyPassPass(PR);
+  initializeX86PreAMXConfigPassPass(PR);
   initializeGlobalISel(PR);
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
@@ -74,6 +75,7 @@
   initializeX86CallFrameOptimizationPass(PR);
   initializeX86CmovConverterPassPass(PR);
   initializeX86TileConfigPass(PR);
+  initializeX86FastTileConfigPass(PR);
   initializeX86LowerTileCopyPass(PR);
   initializeX86ExpandPseudoPass(PR);
   initializeX86ExecutionDomainFixPass(PR);
@@ -417,6 +419,9 @@
   addPass(createX86LowerAMXIntrinsicsPass());
   addPass(createX86LowerAMXTypePass());
 
+  if (TM->getOptLevel() == CodeGenOpt::None)
+    addPass(createX86PreAMXConfigPass());
+
   TargetPassConfig::addIRPasses();
 
   if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -585,7 +590,11 @@
 }
 
 bool X86PassConfig::addPreRewrite() {
-  addPass(createX86TileConfigPass());
+  // FIXME: Key this off use of the fast register allocator, not the opt level.
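+  // At CodeGenOpt::None the fast register allocator is normally in use, so
+  // the pre-written shapes are fixed up by X86FastTileConfig; otherwise the
+  // existing X86TileConfig pass configures the tile registers.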
+ if (getOptLevel() == CodeGenOpt::None) + addPass(createX86FastTileConfigPass()); + else + addPass(createX86TileConfigPass()); return true; } Index: llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll @@ -0,0 +1,1017 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AMX_O0 +source_filename = "amx_api.c" + +%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }> + +@buf = dso_local global [1024 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 { +; AMX_O0-LABEL: test_api: +; AMX_O0: # %bb.0: # %entry +; AMX_O0-NEXT: pushq %rbp +; AMX_O0-NEXT: .cfi_def_cfa_offset 16 +; AMX_O0-NEXT: .cfi_offset %rbp, -16 +; AMX_O0-NEXT: movq %rsp, %rbp +; AMX_O0-NEXT: .cfi_def_cfa_register %rbp +; AMX_O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; AMX_O0-NEXT: subq $25600, %rsp # imm = 0x6400 +; AMX_O0-NEXT: movw %dx, %ax +; AMX_O0-NEXT: movw %si, %cx +; AMX_O0-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: xorl %esi, %esi +; AMX_O0-NEXT: movl $1088, %edx # imm = 0x440 +; AMX_O0-NEXT: callq memset@PLT +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: xorl %esi, %esi +; AMX_O0-NEXT: movl $1088, %edx # imm = 0x440 +; AMX_O0-NEXT: callq memset@PLT +; AMX_O0-NEXT: movw $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: xorl %esi, %esi +; AMX_O0-NEXT: movl $1088, %edx # imm = 0x440 +; AMX_O0-NEXT: callq memset@PLT +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: cmpl $0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: je .LBB0_2 +; AMX_O0-NEXT: # %bb.1: # %if.then +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %si +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %dil +; AMX_O0-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; 
AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %di +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %r8b +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %si +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %r8b +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%rdi) +; AMX_O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: jmp .LBB0_3 +; AMX_O0-NEXT: .LBB0_2: # %if.else +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %si +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %dil +; AMX_O0-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: 
tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %di +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %r8b +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%rdx,%rdi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $buf2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw (%rax), %si +; AMX_O0-NEXT: movw 2(%rax), %dx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %r8b +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%rdi) +; AMX_O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: addq $64, %rdx +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: .LBB0_3: # %if.end +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: movl $1088, %edx # imm = 0x440 +; AMX_O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: vmovdqa64 64(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 128(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: 
vmovdqa64 192(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 256(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 320(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 384(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 448(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 512(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 576(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 640(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 704(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 768(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 832(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 896(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 960(%rax), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 1024(%rax), %zmm0 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: movl $1024, %edx # imm = 0x400 +; AMX_O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: # kill: def $r8 killed $rax +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 +; AMX_O0-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 
%zmm18, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %di +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %r8b +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %dil, %r9b +; AMX_O0-NEXT: movb %r9b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movl $64, %r8d +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; AMX_O0-NEXT: tileloadd (%r9,%r8), %tmm0 +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; AMX_O0-NEXT: tileloadd (%r9,%r8), %tmm1 +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; AMX_O0-NEXT: tileloadd (%r9,%r8), %tmm2 +; AMX_O0-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: addq $64, %rdi +; AMX_O0-NEXT: tilestored %tmm0, (%rdi,%r8) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: movq $buf, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq $32, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 +; AMX_O0-NEXT: 
vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 +; AMX_O0-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: callq memcpy@PLT +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AMX_O0-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AMX_O0-NEXT: # kill: def $rdi killed $rax +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15 +; AMX_O0-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16 +; AMX_O0-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %ax +; AMX_O0-NEXT: movw {{[0-9]+}}(%rsp), %cx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AMX_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %r8 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, 
%r9b +; AMX_O0-NEXT: movb %r9b, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%r8) +; AMX_O0-NEXT: movl $64, %r8d +; AMX_O0-NEXT: tileloadd (%rdi,%r8), %tmm0 +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: movq %rbp, %rsp +; AMX_O0-NEXT: popq %rbp +; AMX_O0-NEXT: .cfi_def_cfa %rsp, 8 +; AMX_O0-NEXT: tilerelease +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: retq +entry: + %m.addr.i85 = alloca i16, align 2 + %n.addr.i86 = alloca i16, align 2 + %base.addr.i87 = alloca i8*, align 8 + %stride.addr.i88 = alloca i64, align 8 + %tile.addr.i = alloca <256 x i32>, align 64 + %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024 + %m.addr.i81 = alloca i16, align 2 + %n.addr.i82 = alloca i16, align 2 + %k.addr.i = alloca i16, align 2 + %dst.addr.i83 = alloca <256 x i32>, align 64 + %src1.addr.i = alloca <256 x i32>, align 64 + %src2.addr.i = alloca <256 x i32>, align 64 + %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024 + %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024 + %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024 + %m.addr.i74 = alloca i16, align 2 + %n.addr.i75 = alloca i16, align 2 + %base.addr.i76 = alloca i8*, align 8 + %stride.addr.i77 = alloca i64, align 8 + %m.addr.i70 = alloca i16, align 2 + %n.addr.i71 = alloca i16, align 2 + %base.addr.i72 = alloca i8*, align 8 + %stride.addr.i73 = alloca i64, align 8 + %m.addr.i66 = alloca i16, align 2 + %n.addr.i67 = alloca i16, align 2 + %base.addr.i68 = alloca i8*, align 8 + %stride.addr.i69 = alloca i64, align 8 + %m.addr.i62 = alloca i16, align 2 + %n.addr.i63 = alloca i16, align 2 + %base.addr.i64 = alloca i8*, align 8 + %stride.addr.i65 = alloca i64, align 8 + %m.addr.i58 = alloca i16, align 2 + %n.addr.i59 = alloca i16, align 2 + %base.addr.i60 = alloca i8*, align 8 + %stride.addr.i61 = alloca i64, align 8 + %m.addr.i = alloca i16, align 2 + %n.addr.i = alloca i16, align 2 + %base.addr.i56 = alloca i8*, align 8 + %stride.addr.i57 = alloca i64, align 8 + %base.addr.i50 = alloca i8*, align 8 + %stride.addr.i51 = alloca i64, align 8 + %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024 + %c49 = alloca %struct.__tile1024i_str, align 64 + %dst.addr.i44 = alloca %struct.__tile1024i_str*, align 8 + %indirect-arg-temp.i = alloca <256 x i32>, align 1024 + %indirect-arg-temp4.i = alloca <256 x i32>, align 1024 + %indirect-arg-temp5.i = alloca <256 x i32>, align 1024 + %b43 = alloca %struct.__tile1024i_str, align 64 + %a42 = alloca %struct.__tile1024i_str, align 64 + %dst.addr.i35 = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i36 = alloca i8*, align 8 + %stride.addr.i37 = alloca i64, align 8 + %dst.addr.i28 = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i29 = alloca i8*, align 8 + %stride.addr.i30 = alloca i64, align 8 + %dst.addr.i21 = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i22 = alloca i8*, align 8 + %stride.addr.i23 = alloca i64, align 8 + %dst.addr.i14 = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i15 = alloca i8*, align 8 + %stride.addr.i16 = alloca i64, align 8 + %dst.addr.i7 = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i8 = alloca i8*, align 8 + %stride.addr.i9 = alloca i64, align 8 + %dst.addr.i = alloca %struct.__tile1024i_str*, align 8 + %base.addr.i = alloca i8*, align 8 + %stride.addr.i = alloca i64, align 8 + %cond.addr = alloca i32, align 4 + %row.addr = alloca i16, align 2 + %col.addr = alloca i16, align 2 + %a = alloca %struct.__tile1024i_str, align 64 + %b = alloca %struct.__tile1024i_str, align 64 + 
%c = alloca %struct.__tile1024i_str, align 64 + store i32 %cond, i32* %cond.addr, align 4 + store i16 %row, i16* %row.addr, align 2 + store i16 %col, i16* %col.addr, align 2 + %0 = bitcast %struct.__tile1024i_str* %a to i8* + call void @llvm.memset.p0i8.i64(i8* align 64 %0, i8 0, i64 1088, i1 false) + %row1 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 0 + %1 = load i16, i16* %row.addr, align 2 + store i16 %1, i16* %row1, align 64 + %col2 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 1 + store i16 8, i16* %col2, align 2 + %2 = bitcast %struct.__tile1024i_str* %b to i8* + call void @llvm.memset.p0i8.i64(i8* align 64 %2, i8 0, i64 1088, i1 false) + %row3 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 0 + store i16 8, i16* %row3, align 64 + %col4 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 1 + %3 = load i16, i16* %col.addr, align 2 + store i16 %3, i16* %col4, align 2 + %4 = bitcast %struct.__tile1024i_str* %c to i8* + call void @llvm.memset.p0i8.i64(i8* align 64 %4, i8 0, i64 1088, i1 false) + %row5 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 0 + %5 = load i16, i16* %row.addr, align 2 + store i16 %5, i16* %row5, align 64 + %col6 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 1 + %6 = load i16, i16* %col.addr, align 2 + store i16 %6, i16* %col6, align 2 + %7 = load i32, i32* %cond.addr, align 4 + %tobool = icmp ne i32 %7, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i35, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i36, align 8 + store i64 32, i64* %stride.addr.i37, align 8 + %8 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8 + %row.i38 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %8, i32 0, i32 0 + %9 = load i16, i16* %row.i38, align 64 + %10 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8 + %col.i39 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %10, i32 0, i32 1 + %11 = load i16, i16* %col.i39, align 2 + %12 = load i8*, i8** %base.addr.i36, align 8 + %13 = load i64, i64* %stride.addr.i37, align 8 + store i16 %9, i16* %m.addr.i, align 2 + store i16 %11, i16* %n.addr.i, align 2 + store i8* %12, i8** %base.addr.i56, align 8 + store i64 %13, i64* %stride.addr.i57, align 8 + %14 = load i16, i16* %m.addr.i, align 2 + %15 = load i16, i16* %n.addr.i, align 2 + %16 = load i8*, i8** %base.addr.i56, align 8 + %17 = load i64, i64* %stride.addr.i57, align 8 + %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %14, i16 %15, i8* %16, i64 %17) #2 + %19 = bitcast x86_amx %18 to <256 x i32> + %20 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8 + %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %20, i32 0, i32 3 + store <256 x i32> %19, <256 x i32>* %tile.i41, align 64 + store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i28, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i29, align 8 + store i64 32, i64* %stride.addr.i30, align 8 + %21 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, 
align 8 + %row.i31 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %21, i32 0, i32 0 + %22 = load i16, i16* %row.i31, align 64 + %23 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8 + %col.i32 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %23, i32 0, i32 1 + %24 = load i16, i16* %col.i32, align 2 + %25 = load i8*, i8** %base.addr.i29, align 8 + %26 = load i64, i64* %stride.addr.i30, align 8 + store i16 %22, i16* %m.addr.i58, align 2 + store i16 %24, i16* %n.addr.i59, align 2 + store i8* %25, i8** %base.addr.i60, align 8 + store i64 %26, i64* %stride.addr.i61, align 8 + %27 = load i16, i16* %m.addr.i58, align 2 + %28 = load i16, i16* %n.addr.i59, align 2 + %29 = load i8*, i8** %base.addr.i60, align 8 + %30 = load i64, i64* %stride.addr.i61, align 8 + %31 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %27, i16 %28, i8* %29, i64 %30) #2 + %32 = bitcast x86_amx %31 to <256 x i32> + %33 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8 + %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %33, i32 0, i32 3 + store <256 x i32> %32, <256 x i32>* %tile.i34, align 64 + store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i21, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i22, align 8 + store i64 32, i64* %stride.addr.i23, align 8 + %34 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8 + %row.i24 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %34, i32 0, i32 0 + %35 = load i16, i16* %row.i24, align 64 + %36 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8 + %col.i25 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %36, i32 0, i32 1 + %37 = load i16, i16* %col.i25, align 2 + %38 = load i8*, i8** %base.addr.i22, align 8 + %39 = load i64, i64* %stride.addr.i23, align 8 + store i16 %35, i16* %m.addr.i62, align 2 + store i16 %37, i16* %n.addr.i63, align 2 + store i8* %38, i8** %base.addr.i64, align 8 + store i64 %39, i64* %stride.addr.i65, align 8 + %40 = load i16, i16* %m.addr.i62, align 2 + %41 = load i16, i16* %n.addr.i63, align 2 + %42 = load i8*, i8** %base.addr.i64, align 8 + %43 = load i64, i64* %stride.addr.i65, align 8 + %44 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %40, i16 %41, i8* %42, i64 %43) #2 + %45 = bitcast x86_amx %44 to <256 x i32> + %46 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8 + %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %46, i32 0, i32 3 + store <256 x i32> %45, <256 x i32>* %tile.i27, align 64 + br label %if.end + +if.else: ; preds = %entry + store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i14, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i15, align 8 + store i64 32, i64* %stride.addr.i16, align 8 + %47 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8 + %row.i17 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %47, i32 0, i32 0 + %48 = load i16, i16* %row.i17, align 64 + %49 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8 + %col.i18 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %49, i32 0, i32 1 + %50 = load i16, i16* %col.i18, align 2 + 
%51 = load i8*, i8** %base.addr.i15, align 8 + %52 = load i64, i64* %stride.addr.i16, align 8 + store i16 %48, i16* %m.addr.i66, align 2 + store i16 %50, i16* %n.addr.i67, align 2 + store i8* %51, i8** %base.addr.i68, align 8 + store i64 %52, i64* %stride.addr.i69, align 8 + %53 = load i16, i16* %m.addr.i66, align 2 + %54 = load i16, i16* %n.addr.i67, align 2 + %55 = load i8*, i8** %base.addr.i68, align 8 + %56 = load i64, i64* %stride.addr.i69, align 8 + %57 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %53, i16 %54, i8* %55, i64 %56) #2 + %58 = bitcast x86_amx %57 to <256 x i32> + %59 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8 + %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %59, i32 0, i32 3 + store <256 x i32> %58, <256 x i32>* %tile.i20, align 64 + store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i7, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i8, align 8 + store i64 32, i64* %stride.addr.i9, align 8 + %60 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8 + %row.i10 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %60, i32 0, i32 0 + %61 = load i16, i16* %row.i10, align 64 + %62 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8 + %col.i11 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %62, i32 0, i32 1 + %63 = load i16, i16* %col.i11, align 2 + %64 = load i8*, i8** %base.addr.i8, align 8 + %65 = load i64, i64* %stride.addr.i9, align 8 + store i16 %61, i16* %m.addr.i70, align 2 + store i16 %63, i16* %n.addr.i71, align 2 + store i8* %64, i8** %base.addr.i72, align 8 + store i64 %65, i64* %stride.addr.i73, align 8 + %66 = load i16, i16* %m.addr.i70, align 2 + %67 = load i16, i16* %n.addr.i71, align 2 + %68 = load i8*, i8** %base.addr.i72, align 8 + %69 = load i64, i64* %stride.addr.i73, align 8 + %70 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %66, i16 %67, i8* %68, i64 %69) #2 + %71 = bitcast x86_amx %70 to <256 x i32> + %72 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8 + %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %72, i32 0, i32 3 + store <256 x i32> %71, <256 x i32>* %tile.i13, align 64 + store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i, align 8 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i, align 8 + store i64 32, i64* %stride.addr.i, align 8 + %73 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8 + %row.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %73, i32 0, i32 0 + %74 = load i16, i16* %row.i, align 64 + %75 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8 + %col.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %75, i32 0, i32 1 + %76 = load i16, i16* %col.i, align 2 + %77 = load i8*, i8** %base.addr.i, align 8 + %78 = load i64, i64* %stride.addr.i, align 8 + store i16 %74, i16* %m.addr.i74, align 2 + store i16 %76, i16* %n.addr.i75, align 2 + store i8* %77, i8** %base.addr.i76, align 8 + store i64 %78, i64* %stride.addr.i77, align 8 + %79 = load i16, i16* %m.addr.i74, align 2 + %80 = load i16, i16* %n.addr.i75, align 2 + %81 = load i8*, i8** %base.addr.i76, align 8 + %82 = load i64, i64* %stride.addr.i77, align 8 + %83 = 
call x86_amx @llvm.x86.tileloadd64.internal(i16 %79, i16 %80, i8* %81, i64 %82) #2 + %84 = bitcast x86_amx %83 to <256 x i32> + %85 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8 + %tile.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %85, i32 0, i32 3 + store <256 x i32> %84, <256 x i32>* %tile.i, align 64 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %86 = bitcast %struct.__tile1024i_str* %b43 to i8* + %87 = bitcast %struct.__tile1024i_str* %b to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %86, i8* align 1 %87, i64 1088, i1 false) #2 + %88 = bitcast %struct.__tile1024i_str* %a42 to i8* + %89 = bitcast %struct.__tile1024i_str* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %88, i8* align 1 %89, i64 1088, i1 false) #2 + store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i44, align 8 + %row.i45 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 0 + %90 = load i16, i16* %row.i45, align 64 + %col.i46 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 1 + %91 = load i16, i16* %col.i46, align 2 + %col1.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 1 + %92 = load i16, i16* %col1.i, align 2 + %93 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8 + %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %93, i32 0, i32 3 + %94 = load <256 x i32>, <256 x i32>* %tile.i47, align 64 + %tile2.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 3 + %95 = load <256 x i32>, <256 x i32>* %tile2.i, align 64 + %tile3.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 3 + %96 = load <256 x i32>, <256 x i32>* %tile3.i, align 64 + store <256 x i32> %94, <256 x i32>* %indirect-arg-temp.i, align 1024 + store <256 x i32> %95, <256 x i32>* %indirect-arg-temp4.i, align 1024 + store <256 x i32> %96, <256 x i32>* %indirect-arg-temp5.i, align 1024 + %97 = bitcast <256 x i32>* %indirect-arg-temp5.i80 to i8* + %98 = bitcast <256 x i32>* %indirect-arg-temp5.i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %97, i8* align 1 %98, i64 1024, i1 false) #2 + %99 = bitcast <256 x i32>* %indirect-arg-temp4.i79 to i8* + %100 = bitcast <256 x i32>* %indirect-arg-temp4.i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %99, i8* align 1 %100, i64 1024, i1 false) #2 + %101 = bitcast <256 x i32>* %indirect-arg-temp.i78 to i8* + %102 = bitcast <256 x i32>* %indirect-arg-temp.i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %101, i8* align 1 %102, i64 1024, i1 false) #2 + %dst.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i78, align 1024 + %src1.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp4.i79, align 1024 + %src2.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp5.i80, align 1024 + store i16 %90, i16* %m.addr.i81, align 2 + store i16 %91, i16* %n.addr.i82, align 2 + store i16 %92, i16* %k.addr.i, align 2 + store <256 x i32> %dst.i, <256 x i32>* %dst.addr.i83, align 64 + store <256 x i32> %src1.i, <256 x i32>* %src1.addr.i, align 64 + store <256 x i32> %src2.i, <256 x i32>* %src2.addr.i, align 64 + %103 = load i16, i16* %m.addr.i81, align 2 + %104 = load i16, i16* %n.addr.i82, align 2 + %105 = load i16, i16* %k.addr.i, align 2 + %106 = load <256 x i32>, <256 x i32>* %dst.addr.i83, align 64 + %107 = 
bitcast <256 x i32> %106 to x86_amx + %108 = load <256 x i32>, <256 x i32>* %src1.addr.i, align 64 + %109 = bitcast <256 x i32> %108 to x86_amx + %110 = load <256 x i32>, <256 x i32>* %src2.addr.i, align 64 + %111 = bitcast <256 x i32> %110 to x86_amx + %112 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %103, i16 %104, i16 %105, x86_amx %107, x86_amx %109, x86_amx %111) #2 + %113 = bitcast x86_amx %112 to <256 x i32> + %114 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8 + %tile6.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %114, i32 0, i32 3 + store <256 x i32> %113, <256 x i32>* %tile6.i, align 64 + %115 = bitcast %struct.__tile1024i_str* %c49 to i8* + %116 = bitcast %struct.__tile1024i_str* %c to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %115, i8* align 1 %116, i64 1088, i1 false) #2 + store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i50, align 8 + store i64 32, i64* %stride.addr.i51, align 8 + %row.i53 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 0 + %117 = load i16, i16* %row.i53, align 64 + %col.i54 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 1 + %118 = load i16, i16* %col.i54, align 2 + %119 = load i8*, i8** %base.addr.i50, align 8 + %120 = load i64, i64* %stride.addr.i51, align 8 + %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 3 + %121 = load <256 x i32>, <256 x i32>* %tile.i55, align 64 + store <256 x i32> %121, <256 x i32>* %indirect-arg-temp.i52, align 1024 + %122 = bitcast <256 x i32>* %indirect-arg-temp.i5284 to i8* + %123 = bitcast <256 x i32>* %indirect-arg-temp.i52 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %122, i8* align 1 %123, i64 1024, i1 false) #2 + %tile.i89 = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i5284, align 1024 + store i16 %117, i16* %m.addr.i85, align 2 + store i16 %118, i16* %n.addr.i86, align 2 + store i8* %119, i8** %base.addr.i87, align 8 + store i64 %120, i64* %stride.addr.i88, align 8 + store <256 x i32> %tile.i89, <256 x i32>* %tile.addr.i, align 64 + %124 = load i16, i16* %m.addr.i85, align 2 + %125 = load i16, i16* %n.addr.i86, align 2 + %126 = load i8*, i8** %base.addr.i87, align 8 + %127 = load i64, i64* %stride.addr.i88, align 8 + %128 = load <256 x i32>, <256 x i32>* %tile.addr.i, align 64 + %129 = bitcast <256 x i32> %128 to x86_amx + call void @llvm.x86.tilestored64.internal(i16 %124, i16 %125, i8* %126, i64 %127, x86_amx %129) #2 + ret void +} + +; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1 + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2 + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2 + +; Function Attrs: nounwind +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2 + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3 + +attributes #0 = { noinline nounwind optnone uwtable } +attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly } +attributes #2 = { nounwind } +attributes #3 = { argmemonly nofree nosync nounwind willreturn } Index: 
llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s --check-prefixes=LOWER_AMX + +@buf = dso_local global [1024 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr { + +; LOWER_AMX-LABEL: entry: +; LOWER_AMX: %0 = alloca <256 x i32>, align 1024 +; LOWER_AMX-NEXT: %1 = bitcast <256 x i32>* %0 to i8* +; LOWER_AMX-NEXT: %2 = alloca <256 x i32>, align 1024 +; LOWER_AMX-NEXT: %3 = bitcast <256 x i32>* %2 to i8* +; LOWER_AMX-NEXT: %4 = alloca <256 x i32>, align 1024 +; LOWER_AMX-NEXT: %5 = bitcast <256 x i32>* %4 to i8* +; LOWER_AMX-NEXT: %6 = alloca <256 x i32>, align 1024 +; LOWER_AMX-NEXT: %7 = bitcast <256 x i32>* %6 to i8* +; LOWER_AMX-NEXT: %tobool.not = icmp eq i32 %cond, 0 +; LOWER_AMX-NEXT: br i1 %tobool.not, label %if.else, label %if.then +; LOWER_AMX: if.then: +; LOWER_AMX-NEXT: %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8) +; LOWER_AMX-NEXT: %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9) +; LOWER_AMX-NEXT: %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10) +; LOWER_AMX-NEXT: br label %if.end +; LOWER_AMX: if.else: +; LOWER_AMX-NEXT: %11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11) +; LOWER_AMX-NEXT: %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12) +; LOWER_AMX-NEXT: %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13) +; LOWER_AMX-NEXT: br label %if.end +; LOWER_AMX: if.end: +; LOWER_AMX-NEXT: %14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64) +; LOWER_AMX-NEXT: %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64) +; LOWER_AMX-NEXT: %16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64) +; LOWER_AMX-NEXT: %17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15) +; LOWER_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 
%row, i16 %col, i8* %7, i64 64, x86_amx %17) +; LOWER_AMX-NEXT: %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64) +; LOWER_AMX-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18) +; LOWER_AMX-NEXT: ret void + +entry: + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + br label %if.end + +if.else: ; preds = %entry + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ] + %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ] + %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ] + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6) + ret void +} + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) + +; Function Attrs: nounwind +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll @@ -0,0 +1,192 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -pre-amx-config -S | FileCheck %s --check-prefixes=PRECFG_AMX + +@buf = dso_local global [1024 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr { +; PRECFG_AMX-LABEL: entry: +; PRECFG_AMX: %0 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %1 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %2 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %3 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %4 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %5 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %6 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %7 = alloca <16 x i32>, align 4 +; PRECFG_AMX-NEXT: %8 = alloca <256 x i32>, align 1024 +; PRECFG_AMX-NEXT: %9 = 
bitcast <256 x i32>* %8 to i8* +; PRECFG_AMX-NEXT: %10 = alloca <256 x i32>, align 1024 +; PRECFG_AMX-NEXT: %11 = bitcast <256 x i32>* %10 to i8* +; PRECFG_AMX-NEXT: %12 = alloca <256 x i32>, align 1024 +; PRECFG_AMX-NEXT: %13 = bitcast <256 x i32>* %12 to i8* +; PRECFG_AMX-NEXT: %14 = alloca <256 x i32>, align 1024 +; PRECFG_AMX-NEXT: %15 = bitcast <256 x i32>* %14 to i8* +; PRECFG_AMX-NEXT: %tobool.not = icmp eq i32 %cond, 0 +; PRECFG_AMX-NEXT: br i1 %tobool.not, label %if.else, label %if.then +; PRECFG_AMX: if.then: +; PRECFG_AMX-NEXT: %16 = bitcast <16 x i32>* %3 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %3, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row7 = getelementptr i8, i8* %16, i64 48 +; PRECFG_AMX-NEXT: %17 = getelementptr i8, i8* %16, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col8 = bitcast i8* %17 to i16* +; PRECFG_AMX-NEXT: %18 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %18, i8* %amx.tmm.0.shape.row7, align 1 +; PRECFG_AMX-NEXT: store volatile i16 8, i16* %amx.tmm.0.shape.col8, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %16) +; PRECFG_AMX-NEXT: %19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %19) +; PRECFG_AMX-NEXT: %20 = bitcast <16 x i32>* %6 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %6, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row1 = getelementptr i8, i8* %20, i64 48 +; PRECFG_AMX-NEXT: %21 = getelementptr i8, i8* %20, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col2 = bitcast i8* %21 to i16* +; PRECFG_AMX-NEXT: %22 = trunc i16 8 to i8 +; PRECFG_AMX-NEXT: store volatile i8 %22, i8* %amx.tmm.0.shape.row1, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col2, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %20) +; PRECFG_AMX-NEXT: %23 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %23) +; PRECFG_AMX-NEXT: %24 = bitcast <16 x i32>* %4 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %4, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row5 = getelementptr i8, i8* %24, i64 48 +; PRECFG_AMX-NEXT: %25 = getelementptr i8, i8* %24, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col6 = bitcast i8* %25 to i16* +; PRECFG_AMX-NEXT: %26 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %26, i8* %amx.tmm.0.shape.row5, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col6, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %24) +; PRECFG_AMX-NEXT: %27 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %27) +; PRECFG_AMX-NEXT: br label %if.end +; PRECFG_AMX: if.else: +; PRECFG_AMX-NEXT: %28 = bitcast <16 x i32>* %5 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %5, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row3 = getelementptr i8, i8* %28, i64 48 +; PRECFG_AMX-NEXT: %29 = 
getelementptr i8, i8* %28, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col4 = bitcast i8* %29 to i16* +; PRECFG_AMX-NEXT: %30 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %30, i8* %amx.tmm.0.shape.row3, align 1 +; PRECFG_AMX-NEXT: store volatile i16 8, i16* %amx.tmm.0.shape.col4, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %28) +; PRECFG_AMX-NEXT: %31 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %31) +; PRECFG_AMX-NEXT: %32 = bitcast <16 x i32>* %2 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %2, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row9 = getelementptr i8, i8* %32, i64 48 +; PRECFG_AMX-NEXT: %33 = getelementptr i8, i8* %32, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col10 = bitcast i8* %33 to i16* +; PRECFG_AMX-NEXT: %34 = trunc i16 8 to i8 +; PRECFG_AMX-NEXT: store volatile i8 %34, i8* %amx.tmm.0.shape.row9, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col10, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %32) +; PRECFG_AMX-NEXT: %35 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %35) +; PRECFG_AMX-NEXT: %36 = bitcast <16 x i32>* %1 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %1, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row11 = getelementptr i8, i8* %36, i64 48 +; PRECFG_AMX-NEXT: %37 = getelementptr i8, i8* %36, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col12 = bitcast i8* %37 to i16* +; PRECFG_AMX-NEXT: %38 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %38, i8* %amx.tmm.0.shape.row11, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col12, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %36) +; PRECFG_AMX-NEXT: %39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %39) +; PRECFG_AMX-NEXT: br label %if.end +; PRECFG_AMX: if.end: +; PRECFG_AMX-NEXT: %40 = bitcast <16 x i32>* %7 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %7, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row = getelementptr i8, i8* %40, i64 48 +; PRECFG_AMX-NEXT: %41 = getelementptr i8, i8* %40, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col = bitcast i8* %41 to i16* +; PRECFG_AMX-NEXT: %42 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %42, i8* %amx.tmm.0.shape.row, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col, align 2 +; PRECFG_AMX-NEXT: %amx.tmm.1.shape.row = getelementptr i8, i8* %40, i64 49 +; PRECFG_AMX-NEXT: %43 = getelementptr i8, i8* %40, i64 18 +; PRECFG_AMX-NEXT: %amx.tmm.1.shape.col = bitcast i8* %43 to i16* +; PRECFG_AMX-NEXT: %44 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %44, i8* %amx.tmm.1.shape.row, align 1 +; PRECFG_AMX-NEXT: store volatile i16 8, i16* %amx.tmm.1.shape.col, align 2 +; PRECFG_AMX-NEXT: %amx.tmm.2.shape.row = getelementptr i8, i8* %40, i64 50 +; PRECFG_AMX-NEXT: %45 = 
getelementptr i8, i8* %40, i64 20 +; PRECFG_AMX-NEXT: %amx.tmm.2.shape.col = bitcast i8* %45 to i16* +; PRECFG_AMX-NEXT: %46 = trunc i16 8 to i8 +; PRECFG_AMX-NEXT: store volatile i8 %46, i8* %amx.tmm.2.shape.row, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.2.shape.col, align 2 +; PRECFG_AMX-NEXT: %amx.tmm.3.shape.row = getelementptr i8, i8* %40, i64 51 +; PRECFG_AMX-NEXT: %47 = getelementptr i8, i8* %40, i64 22 +; PRECFG_AMX-NEXT: %amx.tmm.3.shape.col = bitcast i8* %47 to i16* +; PRECFG_AMX-NEXT: %48 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %48, i8* %amx.tmm.3.shape.row, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.3.shape.col, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %40) +; PRECFG_AMX-NEXT: %49 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %13, i64 64) +; PRECFG_AMX-NEXT: %50 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %11, i64 64) +; PRECFG_AMX-NEXT: %51 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %9, i64 64) +; PRECFG_AMX-NEXT: %52 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %51, x86_amx %49, x86_amx %50) +; PRECFG_AMX-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %15, i64 64, x86_amx %52) +; PRECFG_AMX-NEXT: %53 = bitcast <16 x i32>* %0 to i8* +; PRECFG_AMX-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %0, align 64 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.row13 = getelementptr i8, i8* %53, i64 48 +; PRECFG_AMX-NEXT: %54 = getelementptr i8, i8* %53, i64 16 +; PRECFG_AMX-NEXT: %amx.tmm.0.shape.col14 = bitcast i8* %54 to i16* +; PRECFG_AMX-NEXT: %55 = trunc i16 %row to i8 +; PRECFG_AMX-NEXT: store volatile i8 %55, i8* %amx.tmm.0.shape.row13, align 1 +; PRECFG_AMX-NEXT: store volatile i16 %col, i16* %amx.tmm.0.shape.col14, align 2 +; PRECFG_AMX-NEXT: call void @llvm.x86.ldtilecfg(i8* %53) +; PRECFG_AMX-NEXT: %56 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %15, i64 64) +; PRECFG_AMX-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %56) +; PRECFG_AMX-NEXT: ret void + +entry: + %0 = alloca <256 x i32>, align 1024 + %1 = bitcast <256 x i32>* %0 to i8* + %2 = alloca <256 x i32>, align 1024 + %3 = bitcast <256 x i32>* %2 to i8* + %4 = alloca <256 x i32>, align 1024 + %5 = bitcast <256 x i32>* %4 to i8* + %6 = alloca <256 x i32>, align 1024 + %7 = bitcast <256 x i32>* %6 to i8* + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8) + %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9) + %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10) + br label %if.end + +if.else: ; preds = %entry + %11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 
8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11) + %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12) + %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64) + %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64) + %16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64) + %17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %7, i64 64, x86_amx %17) + %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18) + ret void +} + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) + +; Function Attrs: nounwind +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AMX_O0 + +@buf = dso_local global [1024 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr { +; AMX_O0-LABEL: test_api: +; AMX_O0: # %bb.0: # %entry +; AMX_O0-NEXT: pushq %rbp +; AMX_O0-NEXT: .cfi_def_cfa_offset 16 +; AMX_O0-NEXT: .cfi_offset %rbp, -16 +; AMX_O0-NEXT: movq %rsp, %rbp +; AMX_O0-NEXT: .cfi_def_cfa_register %rbp +; AMX_O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; AMX_O0-NEXT: subq $6144, %rsp # imm = 0x1800 +; AMX_O0-NEXT: movw %dx, %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: movw %si, %ax +; AMX_O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AMX_O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AMX_O0-NEXT: cmpl $0, %edi +; AMX_O0-NEXT: je .LBB0_2 +; AMX_O0-NEXT: # %bb.1: # %if.then +; AMX_O0-NEXT: movw 
{{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %sil +; AMX_O0-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movl $buf, %r9d +; AMX_O0-NEXT: movl $32, %r10d +; AMX_O0-NEXT: movw $8, %si +; AMX_O0-NEXT: tileloadd (%r9,%r10), %tmm0 +; AMX_O0-NEXT: movl $64, %r8d +; AMX_O0-NEXT: tilestored %tmm0, (%r11,%r8) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%r9,%r10), %tmm0 +; AMX_O0-NEXT: tilestored %tmm0, (%rdi,%r8) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %dil +; AMX_O0-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%rsi) +; AMX_O0-NEXT: movl $buf, %esi +; AMX_O0-NEXT: movl $32, %edi +; AMX_O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: jmp .LBB0_3 +; AMX_O0-NEXT: .LBB0_2: # %if.else +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %sil +; AMX_O0-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movl $buf2, %r9d +; AMX_O0-NEXT: movl $32, %r10d +; AMX_O0-NEXT: movw $8, %si +; AMX_O0-NEXT: tileloadd (%r9,%r10), %tmm0 +; AMX_O0-NEXT: movl $64, %r8d +; AMX_O0-NEXT: tilestored %tmm0, (%r11,%r8) +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: tileloadd (%r9,%r10), %tmm0 +; AMX_O0-NEXT: tilestored %tmm0, (%rdi,%r8) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %dil +; AMX_O0-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%rsi) +; AMX_O0-NEXT: movl $buf2, %esi +; AMX_O0-NEXT: movl $32, %edi +; AMX_O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: .LBB0_3: # %if.end +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; AMX_O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AMX_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AMX_O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AMX_O0-NEXT: vmovdqa64 
%zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %sil +; AMX_O0-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb $8, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: movw $8, %di +; AMX_O0-NEXT: tileloadd (%r10,%rsi), %tmm1 +; AMX_O0-NEXT: tileloadd (%r9,%rsi), %tmm2 +; AMX_O0-NEXT: tileloadd (%r8,%rsi), %tmm0 +; AMX_O0-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; AMX_O0-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movb %al, %dil +; AMX_O0-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AMX_O0-NEXT: ldtilecfg (%rsi) +; AMX_O0-NEXT: movl $64, %esi +; AMX_O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; AMX_O0-NEXT: movl $buf, %edx +; AMX_O0-NEXT: movl $32, %esi +; AMX_O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AMX_O0-NEXT: movq %rbp, %rsp +; AMX_O0-NEXT: popq %rbp +; AMX_O0-NEXT: .cfi_def_cfa %rsp, 8 +; AMX_O0-NEXT: tilerelease +; AMX_O0-NEXT: vzeroupper +; AMX_O0-NEXT: retq +entry: + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + br label %if.end + +if.else: ; preds = %entry + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ] + %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ] + %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ] + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6) + ret void +} + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) + +; Function Attrs: nounwind +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) + +; Function Attrs: nounwind +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir =================================================================== --- /dev/null +++ 
llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir @@ -0,0 +1,465 @@ +# RUN: llc -o - -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -run-pass=fasttileconfig %s | FileCheck %s + +--- | + + @buf = dso_local global [1024 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 + + define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 { + entry: + %0 = alloca <16 x i32>, align 4 + %1 = alloca <16 x i32>, align 4 + %2 = alloca <16 x i32>, align 4 + %3 = alloca <16 x i32>, align 4 + %4 = alloca <16 x i32>, align 4 + %5 = alloca <16 x i32>, align 4 + %6 = alloca <16 x i32>, align 4 + %7 = alloca <16 x i32>, align 4 + %8 = alloca <256 x i32>, align 1024 + %9 = bitcast <256 x i32>* %8 to i8* + %10 = alloca <256 x i32>, align 1024 + %11 = bitcast <256 x i32>* %10 to i8* + %12 = alloca <256 x i32>, align 1024 + %13 = bitcast <256 x i32>* %12 to i8* + %14 = alloca <256 x i32>, align 1024 + %15 = bitcast <256 x i32>* %14 to i8* + %tobool.not = icmp eq i32 %cond, 0 + br i1 %tobool.not, label %if.else, label %if.then + + if.then: ; preds = %entry + %16 = bitcast <16 x i32>* %6 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %6, align 64 + %amx.tmm.0.shape.row1 = getelementptr i8, i8* %16, i64 48 + %17 = getelementptr i8, i8* %16, i64 16 + %amx.tmm.0.shape.col2 = bitcast i8* %17 to i16* + %18 = trunc i16 %row to i8 + store volatile i8 %18, i8* %amx.tmm.0.shape.row1, align 1 + store volatile i16 8, i16* %amx.tmm.0.shape.col2, align 2 + call void @llvm.x86.ldtilecfg(i8* %16) + %19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %19) + %20 = bitcast <16 x i32>* %2 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %2, align 64 + %amx.tmm.0.shape.row9 = getelementptr i8, i8* %20, i64 48 + %21 = getelementptr i8, i8* %20, i64 16 + %amx.tmm.0.shape.col10 = bitcast i8* %21 to i16* + %22 = trunc i16 8 to i8 + store volatile i8 %22, i8* %amx.tmm.0.shape.row9, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col10, align 2 + call void @llvm.x86.ldtilecfg(i8* %20) + %23 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %23) + %24 = bitcast <16 x i32>* %3 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %3, align 64 + %amx.tmm.0.shape.row7 = getelementptr i8, i8* %24, i64 48 + %25 = getelementptr i8, i8* %24, i64 16 + %amx.tmm.0.shape.col8 = bitcast i8* %25 to i16* + %26 = trunc i16 %row to i8 + store volatile i8 %26, i8* %amx.tmm.0.shape.row7, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col8, align 2 + call void @llvm.x86.ldtilecfg(i8* %24) + %27 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %27) + br label %if.end + + if.else: ; preds = %entry + %28 = bitcast <16 x i32>* %1 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %1, align 64 + %amx.tmm.0.shape.row11 = getelementptr i8, i8* %28, i64 48 + %29 = getelementptr i8, i8* %28, i64 16 + %amx.tmm.0.shape.col12 = bitcast i8* %29 to i16* + %30 = trunc i16 
%row to i8 + store volatile i8 %30, i8* %amx.tmm.0.shape.row11, align 1 + store volatile i16 8, i16* %amx.tmm.0.shape.col12, align 2 + call void @llvm.x86.ldtilecfg(i8* %28) + %31 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %31) + %32 = bitcast <16 x i32>* %7 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %7, align 64 + %amx.tmm.0.shape.row = getelementptr i8, i8* %32, i64 48 + %33 = getelementptr i8, i8* %32, i64 16 + %amx.tmm.0.shape.col = bitcast i8* %33 to i16* + %34 = trunc i16 8 to i8 + store volatile i8 %34, i8* %amx.tmm.0.shape.row, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col, align 2 + call void @llvm.x86.ldtilecfg(i8* %32) + %35 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %35) + %36 = bitcast <16 x i32>* %0 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %0, align 64 + %amx.tmm.0.shape.row13 = getelementptr i8, i8* %36, i64 48 + %37 = getelementptr i8, i8* %36, i64 16 + %amx.tmm.0.shape.col14 = bitcast i8* %37 to i16* + %38 = trunc i16 %row to i8 + store volatile i8 %38, i8* %amx.tmm.0.shape.row13, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col14, align 2 + call void @llvm.x86.ldtilecfg(i8* %36) + %39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %39) + br label %if.end + + if.end: ; preds = %if.else, %if.then + %40 = bitcast <16 x i32>* %4 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %4, align 64 + %amx.tmm.0.shape.row5 = getelementptr i8, i8* %40, i64 48 + %41 = getelementptr i8, i8* %40, i64 16 + %amx.tmm.0.shape.col6 = bitcast i8* %41 to i16* + %42 = trunc i16 %row to i8 + store volatile i8 %42, i8* %amx.tmm.0.shape.row5, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col6, align 2 + %amx.tmm.1.shape.row = getelementptr i8, i8* %40, i64 49 + %43 = getelementptr i8, i8* %40, i64 18 + %amx.tmm.1.shape.col = bitcast i8* %43 to i16* + %44 = trunc i16 %row to i8 + store volatile i8 %44, i8* %amx.tmm.1.shape.row, align 1 + store volatile i16 8, i16* %amx.tmm.1.shape.col, align 2 + %amx.tmm.2.shape.row = getelementptr i8, i8* %40, i64 50 + %45 = getelementptr i8, i8* %40, i64 20 + %amx.tmm.2.shape.col = bitcast i8* %45 to i16* + %46 = trunc i16 8 to i8 + store volatile i8 %46, i8* %amx.tmm.2.shape.row, align 1 + store volatile i16 %col, i16* %amx.tmm.2.shape.col, align 2 + %amx.tmm.3.shape.row = getelementptr i8, i8* %40, i64 51 + %47 = getelementptr i8, i8* %40, i64 22 + %amx.tmm.3.shape.col = bitcast i8* %47 to i16* + %48 = trunc i16 %row to i8 + store volatile i8 %48, i8* %amx.tmm.3.shape.row, align 1 + store volatile i16 %col, i16* %amx.tmm.3.shape.col, align 2 + call void @llvm.x86.ldtilecfg(i8* %40) + %49 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %13, i64 64) + %50 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %11, i64 64) + %51 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %9, i64 64) + %52 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, 
x86_amx %51, x86_amx %49, x86_amx %50) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %15, i64 64, x86_amx %52) + %53 = bitcast <16 x i32>* %5 to i8* + store <16 x i32> zeroinitializer, <16 x i32>* %5, align 64 + %amx.tmm.0.shape.row3 = getelementptr i8, i8* %53, i64 48 + %54 = getelementptr i8, i8* %53, i64 16 + %amx.tmm.0.shape.col4 = bitcast i8* %54 to i16* + %55 = trunc i16 %row to i8 + store volatile i8 %55, i8* %amx.tmm.0.shape.row3, align 1 + store volatile i16 %col, i16* %amx.tmm.0.shape.col4, align 2 + call void @llvm.x86.ldtilecfg(i8* %53) + %56 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %15, i64 64) + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %56) + ret void + } + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1 + + ; Function Attrs: nounwind + declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1 + + ; Function Attrs: nounwind + declare void @llvm.x86.ldtilecfg(i8*) #2 + + attributes #0 = { "target-features"="+amx-int8,+avx512f" } + attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" } + attributes #2 = { nounwind } + +... +--- +name: test_api +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$edi', virtual-reg: '' } + - { reg: '$esi', virtual-reg: '' } + - { reg: '$edx', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + 
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: default, offset: 0, size: 64, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 8, name: '', type: default, offset: 0, size: 1024, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 9, name: '', type: default, offset: 0, size: 1024, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 10, name: '', type: default, offset: 0, size: 1024, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 11, name: '', type: default, offset: 0, size: 1024, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 12, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 13, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 14, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 15, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 16, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 17, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $edi, $esi, $edx + + renamable $ax = COPY renamable $dx, implicit killed $edx + MOV16mr %stack.17, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.17) + renamable $ax = COPY renamable $si, implicit killed $esi + MOV16mr %stack.16, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.16) + renamable $rax = LEA64r %stack.8, 1, $noreg, 0, $noreg + MOV64mr %stack.15, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.15) + renamable $rax = LEA64r %stack.9, 1, $noreg, 0, $noreg + 
MOV64mr %stack.14, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.14) + renamable $rax = LEA64r %stack.10, 1, $noreg, 0, $noreg + MOV64mr %stack.13, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.13) + renamable $rax = LEA64r %stack.11, 1, $noreg, 0, $noreg + MOV64mr %stack.12, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.12) + CMP32ri8 killed renamable $edi, 0, implicit-def $eflags + JCC_1 %bb.2, 4, implicit killed $eflags + + bb.1.if.then: + successors: %bb.3(0x80000000) + ; CHECK-LABEL: bb.1.if.then + ; tmm0 --> row_offset = 48, col_offset = 16 + ; CHECK: MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1) + ; CHECK: MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2) + ; CHECK: LDTILECFG %stack.6, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg + ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + + ; tmm1 --> row_offset = 49, col_offset = 18 + ; CHECK: MOV8mi %stack.2, 1, $noreg, 49, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9) + ; CHECK: MOV16mr %stack.2, 1, $noreg, 18, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10) + ; CHECK: LDTILECFG %stack.2, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg + ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1 + + ; tmm2 --> row_offset = 50, col_offset = 20 + ; CHECK: MOV8mr %stack.3, 1, $noreg, 50, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7) + ; CHECK: MOV16mr %stack.3, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8) + ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg + ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2 + + $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16) + $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17) + $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15) + $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14) + $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13) + renamable $zmm0 = AVX512_512_SET0 + VMOVDQA64Zmr %stack.6, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.6) + renamable $sil = COPY renamable $al + MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1) + MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2) + LDTILECFG %stack.6, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $r9 = MOV32ri64 @buf + renamable $r10 = MOV32ri64 32 + renamable $si = MOV16ri 8 + renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg + renamable $r8 = MOV32ri64 64 + 
PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + VMOVDQA64Zmr %stack.2, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.2) + MOV8mi %stack.2, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9) + MOV16mr %stack.2, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10) + LDTILECFG %stack.2, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg + PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1 + renamable $rsi = LEA64r %stack.3, 1, $noreg, 0, $noreg + VMOVDQA64Zmr %stack.3, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.3) + renamable $dil = COPY renamable $al + MOV8mr %stack.3, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7) + MOV16mr %stack.3, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8) + LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $rsi = MOV32ri64 @buf + renamable $rdi = MOV32ri64 32 + renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg + renamable $rsi = MOV32ri64 64 + PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2 + JMP_1 %bb.3 + + bb.2.if.else: + successors: %bb.3(0x80000000) + + ; CHECK-LABEL: bb.2.if.else + ; tmm3 --> row_offset = 51, col_offset = 22 + ; CHECK: MOV8mr %stack.1, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11) + ; CHECK: MOV16mi %stack.1, 1, $noreg, 22, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12) + ; CHECK: LDTILECFG %stack.1, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg + ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3 + + ; tmm4 --> row_offset = 52, col_offset = 24 + ; CHECK: MOV8mi %stack.7, 1, $noreg, 52, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row) + ; CHECK: MOV16mr %stack.7, 1, $noreg, 24, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col) + ; CHECK: LDTILECFG %stack.7, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg + ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4 + + ; tmm4 --> row_offset = 53, col_offset = 26 + ; CHECK: MOV8mr %stack.0, 1, $noreg, 53, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13) + ; CHECK: MOV16mr %stack.0, 1, $noreg, 26, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14) + ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm5 = PTILELOADDV 
renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg + ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5 + + $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16) + $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17) + $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15) + $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14) + $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13) + renamable $zmm0 = AVX512_512_SET0 + VMOVDQA64Zmr %stack.1, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.1) + renamable $sil = COPY renamable $al + MOV8mr %stack.1, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11) + MOV16mi %stack.1, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12) + LDTILECFG %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $r9 = MOV32ri64 @buf2 + renamable $r10 = MOV32ri64 32 + renamable $si = MOV16ri 8 + renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg + renamable $r8 = MOV32ri64 64 + PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3 + VMOVDQA64Zmr %stack.7, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.7) + MOV8mi %stack.7, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row) + MOV16mr %stack.7, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col) + LDTILECFG %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg + PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4 + renamable $rsi = LEA64r %stack.0, 1, $noreg, 0, $noreg + VMOVDQA64Zmr %stack.0, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.0) + renamable $dil = COPY renamable $al + MOV8mr %stack.0, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13) + MOV16mr %stack.0, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14) + LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $rsi = MOV32ri64 @buf2 + renamable $rdi = MOV32ri64 32 + renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg + renamable $rsi = MOV32ri64 64 + PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5 + + bb.3.if.end: + ; CHECK-LABEL: bb.3.if.end + ; tmm0 --> row_offset = 48, col_offset = 16 + ; tmm1 --> row_offset = 49, col_offset = 18 + ; tmm2 --> row_offset = 50, col_offset = 20 + ; CHECK: 
MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5) + ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6) + ; CHECK: MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row) + ; CHECK: MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col) + ; CHECK: MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row) + ; CHECK: MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col) + ; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row) + ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col) + ; CHECK: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0 + ; CHECK: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg + ; CHECK: renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg + ; CHECK: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 + + ; tmm6 --> row_offset = 54, col_offset = 28 + ; CHECK: MOV8mr %stack.5, 1, $noreg, 54, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3) + ; CHECK: MOV16mr %stack.5, 1, $noreg, 28, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4) + ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg + ; CHECK: renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg + ; CHECK: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6 + + $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16) + $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17) + $rdx = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load 8 from %stack.12) + $r8 = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15) + $r9 = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14) + $r10 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13) + renamable $zmm0 = AVX512_512_SET0 + VMOVDQA64Zmr %stack.4, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.4) + renamable $sil = COPY renamable $al + MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5) + MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6) + MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row) + MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col) + MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row) + MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col) + MOV8mr %stack.4, 1, $noreg, 51, 
$noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row) + MOV16mr %stack.4, 1, $noreg, 22, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col) + LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $rsi = MOV32ri64 64 + renamable $di = MOV16ri 8 + renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg + renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg + renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $rsi = LEA64r %stack.5, 1, $noreg, 0, $noreg + VMOVDQA64Zmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.5) + renamable $dil = COPY renamable $al + MOV8mr %stack.5, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3) + MOV16mr %stack.5, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4) + LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 + renamable $rsi = MOV32ri64 64 + renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg + renamable $rdx = MOV32ri64 @buf + renamable $rsi = MOV32ri64 32 + PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6 + RETQ + +... 
Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(i32* %A_mem, i32* %B_mem, i32* %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( Index: llvm/test/CodeGen/X86/O0-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/O0-pipeline.ll +++ llvm/test/CodeGen/X86/O0-pipeline.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store +; CHECK-NEXT: Pre AMX Tile Config ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -45,6 +46,7 @@ ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator +; CHECK-NEXT: Fast Tile Register Configure ; CHECK-NEXT: X86 Lower Tile Copy ; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: X86 FP Stackifier Index: llvm/tools/opt/opt.cpp =================================================================== --- llvm/tools/opt/opt.cpp +++ llvm/tools/opt/opt.cpp @@ -520,8 +520,8 @@ "expand-reductions", "indirectbr-expand", "generic-to-nvvm", "expandmemcmp", "loop-reduce", "lower-amx-type", - "lower-amx-intrinsics", "polyhedral-info", - "replace-with-veclib"}; + "pre-amx-config", "lower-amx-intrinsics", + "polyhedral-info", "replace-with-veclib"}; for (const auto &P : PassNamePrefix) if (Pass.startswith(P)) return true; Index: llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn =================================================================== --- llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -87,6 +87,7 @@ "X86EvexToVex.cpp", "X86ExpandPseudo.cpp", "X86FastISel.cpp", + "X86FastTileConfig.cpp", "X86FixupBWInsts.cpp", "X86FixupLEAs.cpp", "X86FixupSetCC.cpp", @@ -110,6 +111,7 @@ "X86LoadValueInjectionRetHardening.cpp", "X86LowerAMXIntrinsics.cpp", "X86LowerAMXType.cpp", + "X86PreAMXConfig.cpp", "X86LowerTileCopy.cpp", "X86MCInstLower.cpp", "X86MachineFunctionInfo.cpp",
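
The row/column byte offsets exercised by the checks in the tests above follow the 64-byte memory operand that ldtilecfg consumes: the row count of physical tile N lives at byte 48 + N, and its column width (in bytes) lives in the 16-bit entry at byte 16 + 2*N, so tile 5, for example, is configured through byte 53 and the 16-bit entry at byte 26, exactly as the MIR checks expect. The sketch below illustrates that layout in C++; it is illustrative only (TileConfig and setTileShape are made-up names, not LLVM or intrinsic APIs) and assumes the standard palette-1 configuration format.

#include <cstdint>

// Sketch of the 64-byte tile configuration block read by ldtilecfg.
// Offsets match the ones rewritten into the config stack slot above:
//   rows of tile N  -> byte 48 + N
//   colsb of tile N -> bytes 16 + 2*N (column width in bytes)
struct TileConfig {
  std::uint8_t palette_id;    // byte 0: palette selector (1 enables tmm0-tmm7)
  std::uint8_t start_row;     // byte 1: restart row for interrupted tile loads/stores
  std::uint8_t reserved[14];  // bytes 2-15
  std::uint16_t colsb[16];    // bytes 16-47: per-tile column width in bytes
  std::uint8_t rows[16];      // bytes 48-63: per-tile row count
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg operand is 64 bytes");

// Hypothetical helper mirroring the per-tile update made to the config buffer:
// set tile `tmm` (0-7) to `rows` x `colBytes`.
inline void setTileShape(TileConfig &cfg, unsigned tmm, std::uint8_t rows,
                         std::uint16_t colBytes) {
  cfg.rows[tmm] = rows;       // stored at offset 48 + tmm
  cfg.colsb[tmm] = colBytes;  // stored at offset 16 + 2 * tmm
}

With palette 1 only the first eight entries are usable, which is why the configured shapes above land in the byte ranges 48-55 and 16-31.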