diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -83,8 +83,8 @@ MIRef FirstAMX; MIRef LastCall; MIRef LastShape; + bool TileCfgForbidden = false; bool NeedTileCfgLiveIn = false; - unsigned ShapeReachedCount = 0; }; class X86PreTileConfig : public MachineFunctionPass { @@ -256,19 +256,17 @@ if (CfgNeedInsert.empty()) return false; - // Calculate how many times the ShapeBB can reach to this BB. - unsigned ShapeBBNum = 0; - for (auto *MBB : ShapeBBs) { - SmallSet VistedBB; - SmallVector WorkList({MBB}); - while (!WorkList.empty()) { - MachineBasicBlock *MBB = WorkList.pop_back_val(); - ++BBVisitedInfo[MBB].ShapeReachedCount; - for (auto *Succ : MBB->successors()) - if (VistedBB.insert(Succ).second && !isLoopBackEdge(Succ, MBB)) - WorkList.push_back(Succ); + // Avoid to insert ldtilecfg before any shape defs. + SmallVector WorkList( + make_range(ShapeBBs.begin(), ShapeBBs.end())); + while (!WorkList.empty()) { + MachineBasicBlock *MBB = WorkList.pop_back_val(); + for (auto *Pred : MBB->predecessors()) { + if (!BBVisitedInfo[Pred].TileCfgForbidden && !isLoopBackEdge(MBB, Pred)) { + BBVisitedInfo[Pred].TileCfgForbidden = true; + WorkList.push_back(Pred); + } } - ++ShapeBBNum; } DebugLoc DL; @@ -283,7 +281,7 @@ while (!WorkList.empty()) { MIRef I = WorkList.pop_back_val(); if (!VisitedOrInserted.count(I)) { - if (BBVisitedInfo[I.MBB].ShapeReachedCount == ShapeBBNum) { + if (!BBVisitedInfo[I.MBB].TileCfgForbidden) { // If the BB is all shapes reachable, stop sink and try to insert. InsertPoints.insert(I); } else { @@ -355,6 +353,8 @@ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48) .addReg(Xmm); } + // Fill in the palette first. + addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV8mi)), SS).addImm(1); return true; } diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -22,9 +22,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -42,28 +40,20 @@ namespace { -class X86TileConfig : public MachineFunctionPass { - // context - MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - MachineDominatorTree *DomTree = nullptr; - MachineRegisterInfo *MRI = nullptr; - VirtRegMap *VRM = nullptr; - LiveIntervals *LIS = nullptr; - - MachineInstr *getTileConfigPoint(); - void tileConfig(); - -public: +struct X86TileConfig : public MachineFunctionPass { + X86TileConfig() : MachineFunctionPass(ID) {} /// Return the pass name. StringRef getPassName() const override { return "Tile Register Configure"; } /// X86TileConfig analysis usage. - void getAnalysisUsage(AnalysisUsage &AU) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } /// Perform register allocation. bool runOnMachineFunction(MachineFunction &mf) override; @@ -82,168 +72,119 @@ INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", false, false) -void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -static unsigned getTilePhysRegIndex(Register PhysReg) { - assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) && - "Tile register number is invalid"); - return (PhysReg - X86::TMM0); -} - -static MachineInstr * -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - Register SrcReg, unsigned BitSize, int FrameIdx, int Offset, - const TargetInstrInfo *TII, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) { - - unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit; - unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr; - if (BitSize == TRI->getRegSizeInBits(*RC)) - SubIdx = 0; - MachineInstr *NewMI = - addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx, - Offset) - .addReg(SrcReg, 0, SubIdx); - return NewMI; -} +bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &ST = MF.getSubtarget(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + LiveIntervals &LIS = getAnalysis(); + VirtRegMap &VRM = getAnalysis(); -static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - int64_t Imm, unsigned BitSize, - int FrameIdx, int Offset, - const TargetInstrInfo *TII) { - unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi; - return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), - FrameIdx, Offset) - .addImm(Imm); -} + if (VRM.isShapeMapEmpty()) + return false; -MachineInstr *X86TileConfig::getTileConfigPoint() { - MachineBasicBlock *Entry = &*MF->begin(); - ReversePostOrderTraversal RPOT(Entry); - for (MachineBasicBlock *MBB : RPOT) { - for (MachineInstr &MI : *MBB) - // Refer X86PreTileConfig.cpp. - // We only support one tile config for now. The other ldtilecfg - // is for spill purpose and is dominated by the first ldtilecfg. - if (MI.getOpcode() == X86::LDTILECFG) - return &MI; + int SS = INT_MAX; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == X86::LDTILECFG) { + SS = MI.getOperand(0).getIndex(); + break; + } + } + if (SS != INT_MAX) + break; } - return nullptr; -} + // Try to find a point to insert MIs for constant shapes. + // Here we are leveraging the palette id inserted in PreRA pass. + unsigned ConstPos = 0; + MachineInstr *ConstMI = nullptr; + for (MachineInstr &MI : MF.front()) { + if (MI.getOpcode() == X86::MOV8mi && SS == MI.getOperand(0).getIndex()) { + ConstMI = &MI; + break; + } + ++ConstPos; + } + assert(ConstMI && "Cannot find an insertion point"); -void X86TileConfig::tileConfig() { - MachineInstr *MI = getTileConfigPoint(); - if (!MI) - return; - MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(0).getIndex(); - BitVector PhysRegs(TRI->getNumRegs()); - - // Fill in the palette first. - auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII); - LIS->InsertMachineInstrInMaps(*NewMI); - // Fill in the shape of each tile physical register. - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - Register VirtReg = Register::index2VirtReg(i); - if (MRI->reg_nodbg_empty(VirtReg)) + unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs(); + SmallVector Phys2Virt(AMXRegNum, 0); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register VirtReg = Register::index2VirtReg(I); + if (MRI.reg_nodbg_empty(VirtReg)) continue; - const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - if (RC.getID() != X86::TILERegClassID) + if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID) continue; - Register PhysReg = VRM->getPhys(VirtReg); - if (PhysRegs.test(PhysReg)) + unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0; + if (!Phys2Virt[Index]) + Phys2Virt[Index] = VirtReg; + } + + // Fill in the shape of each tile physical register. + for (unsigned I = 0; I < AMXRegNum; ++I) { + if (!Phys2Virt[I]) continue; - PhysRegs.set(PhysReg); - ShapeT Shape = VRM->getShape(VirtReg); - Register RowReg = Shape.getRow()->getReg(); - Register ColReg = Shape.getCol()->getReg(); - - // Here is the data format for the tile config. - // 0 palette - // 1 start_row - // 2-15 reserved, must be zero - // 16-17 tile0.colsb Tile 0 bytes per row. - // 18-19 tile1.colsb Tile 1 bytes per row. - // 20-21 tile2.colsb Tile 2 bytes per row. - // ... (sequence continues) - // 30-31 tile7.colsb Tile 7 bytes per row. - // 32-47 reserved, must be zero - // 48 tile0.rows Tile 0 rows. - // 49 tile1.rows Tile 1 rows. - // 50 tile2.rows Tile 2 rows. - // ... (sequence continues) - // 55 tile7.rows Tile 7 rows. - // 56-63 reserved, must be zero - unsigned Index = getTilePhysRegIndex(PhysReg); - int RowOffset = 48 + Index; - int ColOffset = 16 + Index * 2; - - unsigned BitSize = 8; - for (const auto &Pair : {std::make_pair(RowReg, RowOffset), - std::make_pair(ColReg, ColOffset)}) { - int64_t Imm; + DebugLoc DL; + bool IsRow = true; + MachineInstr *NewMI = nullptr; + ShapeT Shape = VRM.getShape(Phys2Virt[I]); + for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) { + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows. + // 56-63 reserved, must be zero int ImmCount = 0; - // All def must be the same value, otherwise it is invalid MIs. - // Immediate is prefered. - for (const MachineOperand &MO : MRI->def_operands(Pair.first)) { - const auto *Inst = MO.getParent(); - if (Inst->isMoveImmediate()) { - ImmCount++; - Imm = Inst->getOperand(1).getImm(); - break; + int Offset = IsRow ? 48 + I : 16 + I * 2; + for (auto &DefMI : MRI.def_instructions(R)) { + MachineBasicBlock &MBB = *DefMI.getParent(); + if (DefMI.isMoveImmediate()) { + // FIXME: We should handle this case in future. + assert(++ImmCount == 1 && "Cannot initialize with different shapes"); + NewMI = addFrameReference( + BuildMI(MF.front(), ++ConstMI->getIterator(), DL, + TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)), + SS, Offset) + .addImm(DefMI.getOperand(1).getImm()); + ConstMI = NewMI; + LIS.InsertMachineInstrInMaps(*NewMI); + } else { + unsigned SubIdx = IsRow ? X86::sub_8bit : X86::sub_16bit; + unsigned RegSize = TRI->getRegSizeInBits(*MRI.getRegClass(R)); + if ((IsRow && RegSize == 8) || (!IsRow && RegSize == 16)) + SubIdx = 0; + auto Iter = DefMI.getIterator(); + if (&MBB == &MF.front() && + std::distance(MBB.instr_begin(), Iter) < ConstPos) + Iter = ConstMI->getIterator(); + NewMI = addFrameReference( + BuildMI(MBB, ++Iter, DL, + TII->get(IsRow ? X86::MOV8mr : X86::MOV16mr)), + SS, Offset) + .addReg(R, 0, SubIdx); + SlotIndex SIdx = LIS.InsertMachineInstrInMaps(*NewMI); + LIS.extendToIndices(LIS.getInterval(R), {SIdx.getRegSlot()}); } } - auto StoreConfig = [&](int Offset) { - MachineInstr *NewMI = nullptr; - if (ImmCount) - NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII); - else { - const TargetRegisterClass *RC = MRI->getRegClass(Pair.first); - NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS, - Offset, TII, RC, TRI); - } - SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI); - if (!ImmCount) { - // Extend the live interval. - SmallVector EndPoints = {SIdx.getRegSlot()}; - LiveInterval &Int = LIS->getInterval(Pair.first); - LIS->extendToIndices(Int, EndPoints); - } - }; - StoreConfig(Pair.second); - BitSize += 8; + IsRow = false; } } -} - -bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - MRI = &mf.getRegInfo(); - ST = &mf.getSubtarget(); - TRI = ST->getRegisterInfo(); - TII = mf.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis(); - VRM = &getAnalysis(); - LIS = &getAnalysis(); - - if (VRM->isShapeMapEmpty()) - return false; - - tileConfig(); return true; } diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -29,12 +29,12 @@ ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d @@ -71,12 +71,12 @@ ; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; IPRA-NEXT: movl $buf, %eax ; IPRA-NEXT: movl $32, %ecx @@ -115,11 +115,11 @@ ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: testl %r14d, %r14d ; CHECK-NEXT: jg .LBB2_4 @@ -274,6 +274,9 @@ ; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r14d ; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: movw $8, %bp @@ -282,9 +285,6 @@ ; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: testl %ebx, %ebx ; CHECK-NEXT: jle .LBB3_3 diff --git a/llvm/test/CodeGen/X86/AMX/amx-config.ll b/llvm/test/CodeGen/X86/AMX/amx-config.ll --- a/llvm/test/CodeGen/X86/AMX/amx-config.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-config.ll @@ -12,15 +12,15 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: testl %edi, %edi ; AVX512-NEXT: movsbl %sil, %eax -; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; AVX512-NEXT: je .LBB0_2 ; AVX512-NEXT: # %bb.1: @@ -46,15 +46,15 @@ ; AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: testl %edi, %edi ; AVX2-NEXT: movsbl %sil, %eax -; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; AVX2-NEXT: je .LBB0_2 ; AVX2-NEXT: # %bb.1: @@ -82,15 +82,15 @@ ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: testl %edi, %edi ; SSE2-NEXT: movsbl %sil, %eax -; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -8,12 +8,12 @@ ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx @@ -48,19 +48,19 @@ ; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al @@ -121,6 +121,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB2_2 @@ -130,9 +133,7 @@ ; CHECK-NEXT: .LBB2_2: # %if.false ; CHECK-NEXT: decl %edi ; CHECK-NEXT: .LBB2_3: # %exit -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: movl $buf, %eax @@ -158,25 +159,25 @@ ret void } -; TODO: There's PRA Tile Register Configure bug needs to fix later. define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind { ; CHECK-LABEL: test4: ; CHECK: # %bb.0: ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB3_3 ; CHECK-NEXT: # %bb.1: # %if.true ; CHECK-NEXT: incl %edi +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB3_4 ; CHECK-NEXT: .LBB3_2: # %amx2 -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: movl $buf+1024, %ecx ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 @@ -187,11 +188,12 @@ ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB3_3: # %if.false ; CHECK-NEXT: decl %edi +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB3_2 ; CHECK-NEXT: .LBB3_4: # %amx1 -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx @@ -231,6 +233,9 @@ ; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl $buf, %r8d ; CHECK-NEXT: movl $32, %edx @@ -240,13 +245,11 @@ ; CHECK-NEXT: .LBB4_3: # %if.false ; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1 ; CHECK-NEXT: movl %ecx, %esi +; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: jne .LBB4_5 ; CHECK-NEXT: .LBB4_1: # %loop.bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB4_3 @@ -291,6 +294,8 @@ ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: movl $buf, %ecx ; CHECK-NEXT: movl $32, %edx @@ -303,10 +308,8 @@ ; CHECK-NEXT: .LBB5_4: # %loop.bb2 ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: leal (%rdi,%rsi), %eax -; CHECK-NEXT: cmpw $7, %si -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: cmpw $7, %si ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -16,20 +16,20 @@ ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movl $buf+2048, %r15d diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll @@ -11,12 +11,9 @@ ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) @@ -24,9 +21,12 @@ ; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r8d ; CHECK-NEXT: movl $32, %eax