diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -281,6 +281,7 @@ Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; + bool shouldAllocateRegister(const Register Reg) const; int getStackSpaceFor(Register VirtReg); void spill(MachineBasicBlock::iterator Before, Register VirtReg, MCPhysReg AssignedReg, bool Kill, bool LiveOut); @@ -300,6 +301,12 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) +bool RegAllocFast::shouldAllocateRegister(const Register Reg) const { + assert(Register::isVirtualRegister(Reg)); + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + return ShouldAllocateClass(*TRI, RC); +} + void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) RegUnitStates[*UI] = NewState; @@ -830,6 +837,8 @@ assert(MO.isUndef() && "expected undef use"); Register VirtReg = MO.getReg(); assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg"); + if (!shouldAllocateRegister(VirtReg)) + return; LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); MCPhysReg PhysReg; @@ -855,6 +864,8 @@ /// (tied or earlyclobber) that may interfere with preassigned uses. 
void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { + if (!shouldAllocateRegister(VirtReg)) + return; LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end()) { MCPhysReg PrevReg = LRI->PhysReg; @@ -888,6 +899,8 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses) { assert(VirtReg.isVirtual() && "Not a virtual register"); + if (!shouldAllocateRegister(VirtReg)) + return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -938,6 +951,8 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { assert(VirtReg.isVirtual() && "Not a virtual register"); + if (!shouldAllocateRegister(VirtReg)) + return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -1071,6 +1086,8 @@ assert(RegClassDefCounts.size() == TRI->getNumRegClasses()); if (Reg.isVirtual()) { + if (!shouldAllocateRegister(Reg)) + return; const TargetRegisterClass *OpRC = MRI->getRegClass(Reg); for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses(); RCIdx != RCIdxEnd; ++RCIdx) { @@ -1130,6 +1147,8 @@ if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { + if (!shouldAllocateRegister(Reg)) + continue; if (MO.isDef()) { HasDef = true; HasVRegDef = true; @@ -1193,7 +1212,7 @@ } if (MO.isDef()) { - if (Reg.isVirtual()) + if (Reg.isVirtual() && shouldAllocateRegister(Reg)) DefOperandIndexes.push_back(I); addRegClassDefCounts(RegClassDefCounts, Reg); @@ -1283,6 +1302,10 @@ Register Reg = MO.getReg(); if (!Reg) continue; + if (Reg.isVirtual()) { + assert(!shouldAllocateRegister(Reg)); + continue; + } assert(Reg.isPhysical()); if (MRI->isReserved(Reg)) continue; @@ -1329,7 +1352,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; if (MO.isUndef()) { @@ -1356,7 +1379,7 
@@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1379,6 +1402,10 @@ Register Reg = MO.getReg(); if (!Reg) continue; + if (Reg.isVirtual()) { + assert(!shouldAllocateRegister(Reg)); + continue; + } assert(Reg.isPhysical() && "should have register assigned"); // We sometimes get odd situations like: @@ -1408,6 +1435,8 @@ for (Register Reg : MI.getUsedDebugRegs()) { if (!Register::isVirtualRegister(Reg)) continue; + if (!shouldAllocateRegister(Reg)) + continue; // Already spilled to a stackslot? int SS = StackSlotForVirtReg[Reg]; @@ -1448,7 +1477,7 @@ continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; DenseMap<Register, MCPhysReg>::iterator DI; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -120,6 +120,8 @@ bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + /// Returns true if PhysReg is a fixed register. bool isFixedRegister(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -676,6 +676,10 @@ return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); } +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. 
This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -57,6 +57,10 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableTileRAPass("x86-tile-ra", + cl::desc("Enable the tile register allocation pass"), + cl::init(true), cl::Hidden); extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. @@ -387,6 +391,7 @@ void addPreEmitPass2() override; void addPreSched2() override; bool addPreRewrite() override; + bool addRegAssignAndRewriteFast() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; @@ -607,6 +612,18 @@ })); } +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteFast() { + // Allocate AMX registers separately. 
+ if (EnableTileRAPass) + addPass(createFastRegisterAllocator(onlyAllocateTileRegisters, false)); + return TargetPassConfig::addRegAssignAndRewriteFast(); +} + bool X86PassConfig::addPostFastRegAllocRewrite() { addPass(createX86FastTileConfigPass()); return true; diff --git a/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll b/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck -check-prefix=O0 %s + +define dso_local void @foo(i32 noundef %t, i16 %row, i16 %col) nounwind { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: tilezero %tmm2 +; CHECK-NEXT: tilezero %tmm3 +; CHECK-NEXT: tilezero %tmm4 +; CHECK-NEXT: #APP +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: tdpbssd %tmm3, %tmm2, %tmm4 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; O0-LABEL: foo: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbp +; O0-NEXT: movq %rsp, %rbp +; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; O0-NEXT: subq $6144, %rsp # imm = 0x1800 +; O0-NEXT: vxorps %xmm0, 
%xmm0, %xmm0 +; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %dx, %cx +; O0-NEXT: movw %si, %ax +; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: #APP +; O0-NEXT: movl %edi, %eax +; O0-NEXT: movl %edi, %edx +; O0-NEXT: addl %eax, %edx +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: tilezero %tmm1 +; O0-EMPTY: +; O0-NEXT: #NO_APP +; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm1 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm2 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), 
%tmm0 +; O0-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: movq %rbp, %rsp +; O0-NEXT: popq %rbp +; O0-NEXT: tilerelease +; O0-NEXT: vzeroupper +; O0-NEXT: retq +entry: + %0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + ; clobber tmm register + %3 = tail call i32 asm sideeffect "mov $1, %eax;mov $1, $0;add %eax, $0;tilezero %tmm0;tilezero %tmm1;", "=r,r,~{eax},~{tmm0},~{tmm1},~{dirflag},~{fpsr},~{flags}"(i32 %t) + %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -45,6 +45,7 @@ ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator +; CHECK-NEXT: Fast Register Allocator ; CHECK-NEXT: Fast Tile Register Configure ; CHECK-NEXT: X86 Lower Tile Copy ; CHECK-NEXT: Bundle Machine CFG Edges diff --git a/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir b/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir --- a/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir +++ b/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir @@ -1,5 +1,5 @@ # RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s -# RUN: llc -O0 -start-after=phi-node-elimination -stop-after=regallocfast %s 
-mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s --check-prefix=FASTREG +# RUN: llc -O0 -start-after=phi-node-elimination -x86-tile-ra=0 -stop-after=regallocfast %s -mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s --check-prefix=FASTREG # # Test that DBG_INSTR_REFs can pass through livedebugvariables to the end of # regalloc without problem. Program body copied from