diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -120,6 +120,8 @@ bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + /// Returns true if PhysReg is a fixed register. bool isFixedRegister(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -676,6 +676,10 @@ return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); } +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -57,6 +57,10 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableTileRAPass("x86-tile-ra", + cl::desc("Enable the tile register allocation pass"), + cl::init(true), cl::Hidden); extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. 
@@ -387,6 +391,7 @@ void addPreEmitPass2() override; void addPreSched2() override; bool addPreRewrite() override; + bool addRegAssignAndRewriteFast() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; @@ -607,6 +612,18 @@ })); } +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteFast() { + // Allocate AMX registers separately. + if (EnableTileRAPass) + addPass(createFastRegisterAllocator(onlyAllocateTileRegisters, false)); + return TargetPassConfig::addRegAssignAndRewriteFast(); +} + bool X86PassConfig::addPostFastRegAllocRewrite() { addPass(createX86FastTileConfigPass()); return true; diff --git a/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll b/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-regalloc.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck -check-prefix=O0 %s + +define dso_local void @foo(i32 noundef %t, i16 %row, i16 %col) nounwind { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: tilezero %tmm2 +; CHECK-NEXT: tilezero %tmm3 +; CHECK-NEXT: tilezero %tmm4 +; CHECK-NEXT: #APP +; CHECK-NEXT: movl %edi, %eax +; 
CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: tdpbssd %tmm3, %tmm2, %tmm4 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; O0-LABEL: foo: +; O0: # %bb.0: # %entry +; O0-NEXT: pushq %rbp +; O0-NEXT: movq %rsp, %rbp +; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; O0-NEXT: subq $6144, %rsp # imm = 0x1800 +; O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %dx, %cx +; O0-NEXT: movw %si, %ax +; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: #APP +; O0-NEXT: movl %edi, %eax +; O0-NEXT: movl %edi, %edx +; O0-NEXT: 
addl %eax, %edx +; O0-NEXT: tilezero %tmm0 +; O0-NEXT: tilezero %tmm1 +; O0-EMPTY: +; O0-NEXT: #NO_APP +; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm1 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm2 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; O0-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: movq %rbp, %rsp +; O0-NEXT: popq %rbp +; O0-NEXT: tilerelease +; O0-NEXT: vzeroupper +; O0-NEXT: retq +entry: + %0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + ; clobber tmm register + %3 = tail call i32 asm sideeffect "mov $1, %eax;mov $1, $0;add %eax, $0;tilezero %tmm0;tilezero %tmm1;", "=r,r,~{eax},~{tmm0},~{tmm1},~{dirflag},~{fpsr},~{flags}"(i32 %t) + %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-regalloc2.ll b/llvm/test/CodeGen/X86/AMX/amx-regalloc2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-regalloc2.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-after=regallocfast | FileCheck %s + +define dso_local void @foo(i32 noundef %t, i16 %row, i16 %col) nounwind { + ; CHECK-LABEL: name: foo + ; CHECK: 
bb.0.entry: + ; CHECK-NEXT: liveins: $edi, $esi, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY killed $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY killed $esi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY killed $edi + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit + ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) + ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV [[COPY4]], [[COPY3]] + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.3, 1, $noreg, 0, $noreg + ; CHECK-NEXT: PTILESTOREDV [[COPY4]], [[COPY3]], killed [[LEA64r]], 1, killed [[MOV32ri64_]], 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV [[COPY4]], [[COPY3]] + ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = LEA64r %stack.2, 1, $noreg, 0, $noreg + ; CHECK-NEXT: PTILESTOREDV [[COPY4]], [[COPY3]], killed [[LEA64r1]], 1, killed [[MOV32ri64_1]], 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV [[COPY4]], [[COPY3]] + ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg + ; CHECK-NEXT: PTILESTOREDV [[COPY4]], [[COPY3]], killed [[LEA64r2]], 1, killed [[MOV32ri64_2]], 0, $noreg, killed renamable $tmm0 + ; 
CHECK-NEXT: [[COPY5:%[0-9]+]]:gr32 = COPY [[COPY2]] + ; CHECK-NEXT: INLINEASM &"mov $1, %eax;mov $1, $0;add %eax, $0;tilezero %tmm0;tilezero %tmm1;", 1 /* sideeffect attdialect */, 2359306 /* regdef:GR32 */, def %17, 2359305 /* reguse:GR32 */, [[COPY5]], 12 /* clobber */, implicit-def dead early-clobber $eax, 12 /* clobber */, implicit-def dead early-clobber $tmm0, 12 /* clobber */, implicit-def dead early-clobber $tmm1, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK-NEXT: [[MOV32ri64_3:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r3:%[0-9]+]]:gr64 = LEA64r %stack.3, 1, $noreg, 0, $noreg + ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV [[COPY4]], [[COPY3]], killed [[LEA64r3]], 1, killed [[MOV32ri64_3]], 0, $noreg + ; CHECK-NEXT: [[MOV32ri64_4:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r4:%[0-9]+]]:gr64 = LEA64r %stack.2, 1, $noreg, 0, $noreg + ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV [[COPY4]], [[COPY3]], killed [[LEA64r4]], 1, killed [[MOV32ri64_4]], 0, $noreg + ; CHECK-NEXT: [[MOV32ri64_5:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r5:%[0-9]+]]:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV [[COPY4]], [[COPY3]], killed [[LEA64r5]], 1, killed [[MOV32ri64_5]], 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV [[COPY4]], [[COPY3]], [[COPY3]], renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + ; CHECK-NEXT: [[MOV32ri64_6:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[LEA64r6:%[0-9]+]]:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg + ; CHECK-NEXT: PTILESTOREDV [[COPY4]], [[COPY3]], killed [[LEA64r6]], 1, killed [[MOV32ri64_6]], 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: RET64 +entry: + %0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + 
%2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + ; clobber tmm register + %3 = tail call i32 asm sideeffect "mov $1, %eax;mov $1, $0;add %eax, $0;tilezero %tmm0;tilezero %tmm1;", "=r,r,~{eax},~{tmm0},~{tmm1},~{dirflag},~{fpsr},~{flags}"(i32 %t) + %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -45,6 +45,7 @@ ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator +; CHECK-NEXT: Fast Register Allocator ; CHECK-NEXT: Fast Tile Register Configure ; CHECK-NEXT: X86 Lower Tile Copy ; CHECK-NEXT: Bundle Machine CFG Edges diff --git a/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir b/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir --- a/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir +++ b/llvm/test/DebugInfo/MIR/InstrRef/survives-livedebugvars.mir @@ -1,5 +1,5 @@ # RUN: llc -start-after=phi-node-elimination -stop-after=virtregrewriter %s -mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s -# RUN: llc -O0 -start-after=phi-node-elimination -stop-after=regallocfast %s -mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s --check-prefix=FASTREG +# RUN: llc -O0 -start-after=phi-node-elimination -x86-tile-ra=0 -stop-after=regallocfast %s -mtriple=x86_64-unknown-unknown -o - -experimental-debug-variable-locations | FileCheck %s --check-prefix=FASTREG # # Test that DBG_INSTR_REFs can pass through livedebugvariables to the end of # regalloc without problem. 
Program body copied from