diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -345,6 +345,9 @@ // Helper to verify the analysis is really immutable. void setOpt(bool &Opt, bool Val); + /// Return true if the default register allocator is overridden by the -regalloc option. + bool isCustomizedRegAlloc(); + /// Methods with trivial inline returns are convenient points in the common /// codegen pass pipeline where targets may insert passes. Methods with /// out-of-line standard implementations are major CodeGen stages called by diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1405,6 +1405,11 @@ return createTargetRegisterAllocator(Optimized); } +bool TargetPassConfig::isCustomizedRegAlloc() { + return RegAlloc != + (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator; +} + bool TargetPassConfig::addRegAssignAndRewriteFast() { if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator && RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -120,6 +120,9 @@ bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; + /// Return true if \p RC is the tile register class. + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + /// Returns true if PhysReg is a fixed register. bool isFixedRegister(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -676,6 +676,10 @@ return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); } +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -36,6 +36,7 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" @@ -58,6 +59,11 @@ cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableTileRAPass("x86-tile-ra", + cl::desc("Enable the tile register allocation pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target()); @@ -386,7 +392,7 @@ void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; - bool addPreRewrite() override; + bool addRegAssignAndRewriteOptimized() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; @@ -612,11 +618,21 @@ return true; } -bool X86PassConfig::addPreRewrite() { - addPass(createX86TileConfigPass()); - return true; -} - std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const { return getStandardCSEConfigForOpt(TM->getOptLevel()); } + +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteOptimized() { + // Don't run tile RA when the register allocator is specified on the command line via "-regalloc". + if (!isCustomizedRegAlloc() && EnableTileRAPass) { + // Allocate tile registers first. + addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters)); + addPass(createX86TileConfigPass()); + } + return TargetPassConfig::addRegAssignAndRewriteOptimized(); +} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -36,7 +36,7 @@ using namespace llvm; -#define DEBUG_TYPE "tile-config" +#define DEBUG_TYPE "tileconfig" namespace { @@ -70,11 +70,11 @@ char X86TileConfig::ID = 0; -INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", +INITIALIZE_PASS_BEGIN(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", - false, false) +INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, + false) bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); @@ -123,6 +123,8 @@ continue; if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID) continue; + if (VRM.getPhys(VirtReg) == VirtRegMap::NO_PHYS_REG) + continue; unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0; if (!Phys2Virt[Index]) Phys2Virt[Index] = VirtReg; diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -485,14 +485,14 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $1088, %rsp # imm = 0x440 -; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl %edi, %r15d ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r14d -; CHECK-NEXT: movl $32, %r15d +; CHECK-NEXT: movl $32, %ebx ; CHECK-NEXT: movw $8, %bp ; CHECK-NEXT: movl $buf+2048, %r12d ; CHECK-NEXT: .p2align 4, 0x90 @@ -500,17 +500,17 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) -; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: testl %r15d, %r15d ; CHECK-NEXT: jle .LBB3_3 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 +; CHECK-NEXT: tileloadd (%r14,%rbx), %tmm0 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 #
1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm0, (%r12,%r15) +; CHECK-NEXT: tilestored %tmm0, (%r12,%rbx) ; CHECK-NEXT: callq foo ; CHECK-NEXT: jmp .LBB3_1 ; CHECK-NEXT: .LBB3_3: diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-before virtregrewriter | FileCheck %s + +define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row_from, i32 %c_row_to, i32 %c_row_tile, i32 %c_col_from, i32 %c_col_to, i32 %c_col_tile) { + ; CHECK-LABEL: name: foo + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $esi, $edx, $rcx, $r8, $r9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $r9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $r8 + ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.1) + ; CHECK-NEXT: undef %82.sub_32bit:gr64_with_sub_8bit = COPY $edx + ; CHECK-NEXT: undef %84.sub_32bit:gr64_nosp = COPY $esi + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.4, align 8) + ; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.5, align 16) + ; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r %82, 1, $noreg, 63, $noreg + ; CHECK-NEXT: TEST32rr %82.sub_32bit, %82.sub_32bit, implicit-def $eflags + ; CHECK-NEXT: [[CMOV32rr:%[0-9]+]]:gr32 = CMOV32rr [[CMOV32rr]], %82.sub_32bit, 9, implicit killed $eflags + ; CHECK-NEXT: CMP32rr [[MOV32rm1]], [[MOV32rm]], implicit-def $eflags + ; CHECK-NEXT: JCC_1 %bb.4, 13, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.for.cond14.preheader.lr.ph: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %88.sub_32bit:gr64_nosp = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, %88.sub_16bit :: (store (s512) into %stack.0 + 16, align 4) + ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 49, $noreg, [[MOV32rm2]].sub_8bit :: (store (s512) into %stack.0 + 49, align 1, basealign 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[MOV32rm2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) + ; CHECK-NEXT: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[AND32ri8_]], -64, implicit-def dead $eflags + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY %82.sub_32bit + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, $noreg, [[COPY2]].sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4) + ; CHECK-NEXT: [[SUB32rr:%[0-9]+]]:gr32 = SUB32rr [[SUB32rr]], [[AND32ri8_]], implicit-def dead $eflags + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, 
$noreg, [[SUB32rr]].sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4) + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[SUB32rr]].sub_16bit + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, [[MOVZX32rr16_]].sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4) + ; CHECK-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[SHR32ri]], 2, implicit-def dead $eflags + ; CHECK-NEXT: MOV32mr %stack.2, 1, $noreg, 0, $noreg, [[SHR32ri]] :: (store (s32) into %stack.2) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, [[SHR32ri]].sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4) + ; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r $noreg, 4, %88, 0, $noreg + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 20, $noreg, [[LEA64_32r]].sub_16bit :: (store (s512) into %stack.0 + 20, align 4) + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %82.sub_32bit + ; CHECK-NEXT: %82.sub_32bit:gr64_with_sub_8bit = nsw SUB32rr %82.sub_32bit, [[SUB32rr]], implicit-def dead $eflags + ; CHECK-NEXT: undef %102.sub_32bit:gr64_with_sub_8bit = MOVZX32rr16 %82.sub_16bit + ; CHECK-NEXT: MOV64mr %stack.3, 1, $noreg, 0, $noreg, %102 :: (store (s64) into %stack.3) + ; CHECK-NEXT: undef %61.sub_32bit:gr64_with_sub_8bit = COPY %102.sub_32bit + ; CHECK-NEXT: %61.sub_32bit:gr64_with_sub_8bit = IMUL32rr %61.sub_32bit, %84.sub_32bit, implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = LEA64_32r $noreg, 4, %84, 0, $noreg + ; CHECK-NEXT: [[MOVSX64rr32_1:%[0-9]+]]:gr64 = MOVSX64rr32 [[LEA64_32r1]] + ; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rr32_1]] :: (store (s64) into %stack.4) + ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %84.sub_32bit + ; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) + ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %88.sub_32bit + ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) + ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] + ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm2]] + ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] + ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) + ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64_nosp = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_nosp = ADD64rr [[ADD64rr]], [[MOVSX64rm32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[ADD64rr]], 0, $noreg + ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) + ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] + ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = 
IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64 = SHL64ri [[SHL64ri]], 2, implicit-def dead $eflags + ; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[SHL64ri]] :: (store (s64) into %stack.10) + ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = LEA64r $noreg, 4, [[MOVSX64rr32_3]], 0, $noreg + ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.5) + ; CHECK-NEXT: [[LEA64_32r2:%[0-9]+]]:gr32 = LEA64_32r %61, 4, [[MOVSX64rm32_]], 0, $noreg + ; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r2]] :: (store (s32) into %stack.11) + ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.13) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.for.cond14.preheader: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) + ; CHECK-NEXT: CMP32rm [[MOV32rm3]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) + ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.for.body17.lr.ph: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = nsw IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm]], %stack.3, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) + ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.12) + ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load (s32) from %stack.11) + ; CHECK-NEXT: undef %68.sub_32bit:gr64_nosp = COPY [[MOV32rm4]] + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: JMP_1 %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.for.cond.cleanup: + ; CHECK-NEXT: RET 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.cond.cleanup16: + ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) + ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm2]], implicit-def dead $eflags :: (store (s64) into %stack.9) + ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.6) + ; CHECK-NEXT: CMP64rm [[ADD64rm1]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) + ; CHECK-NEXT: JCC_1 %bb.2, 12, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.4 + ; CHECK-NEXT: {{ $}} + ; 
CHECK-NEXT: bb.6.for.body17: + ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV32rm2]].sub_16bit, %88.sub_16bit + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12) + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[MOVSX64rr32_7:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOVSX64rr32_7]].sub_32bit + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY %88 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[MOV32rm2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr32 = COPY [[SUB32rr]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY [[COPY1]] + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY9]], 1, [[MOVSX64rr32_7]], 0, $noreg + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64_nosp = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) + ; Check LEA64_32r register is split to COPY10 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r]] + ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm4]], 0, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[COPY10]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[COPY9]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY8]] + ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY2]] + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY13]].sub_16bit, [[COPY11]].sub_16bit, [[COPY12]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY13]].sub_16bit, [[COPY18]].sub_16bit, [[MOV64rm]], 1, [[COPY16]], 0, $noreg, [[PTDPBSSDV]] + ; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY15]], implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm5]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY11]], implicit-def dead $eflags + ; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY14]], implicit-def $eflags + ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit killed $eflags + ; CHECK-NEXT: JMP_1 %bb.5 +entry: + %rem = srem i32 %K, 64 + %conv3 = trunc i32 %rem to i16 + %conv4 = trunc i32 %c_row_tile to i16 + %conv5 = trunc i32 %c_col_tile to i16 + %0 = lshr i16 %conv3, 2 + %conv13 = shl i16 %conv5, 2 + %cmp83 = icmp slt i32 %c_row_from, %c_row_to + br i1 %cmp83, label %for.cond14.preheader.lr.ph, label %for.cond.cleanup + +for.cond14.preheader.lr.ph: ; preds = %entry + %sub = sub nsw i32 %K, %rem + %conv1 = and i32 %sub, 65535 + %cmp1581 = icmp slt i32 %c_col_from, %c_col_to + 
%conv20 = sext i32 %K to i64 + %mul22 = mul nsw i32 %conv1, %N + %mul27 = shl nsw i32 %N, 2 + %conv28 = sext i32 %mul27 to i64 + %conv34 = sext i32 %N to i64 + %1 = sext i32 %c_col_from to i64 + %2 = sext i32 %c_col_tile to i64 + %3 = sext i32 %c_col_to to i64 + %4 = sext i32 %c_row_from to i64 + %5 = sext i32 %c_row_tile to i64 + %6 = zext i32 %conv1 to i64 + %7 = sext i32 %c_row_to to i64 + br label %for.cond14.preheader + +for.cond14.preheader: ; preds = %for.cond.cleanup16, %for.cond14.preheader.lr.ph + %indvars.iv87 = phi i64 [ %4, %for.cond14.preheader.lr.ph ], [ %indvars.iv.next88, %for.cond.cleanup16 ] + br i1 %cmp1581, label %for.body17.lr.ph, label %for.cond.cleanup16 + +for.body17.lr.ph: ; preds = %for.cond14.preheader + %8 = mul nsw i64 %indvars.iv87, %conv20 + %9 = add nsw i64 %8, %6 + %arrayidx = getelementptr inbounds i8, ptr %A, i64 %9 + %10 = mul nsw i64 %indvars.iv87, %conv34 + br label %for.body17 + +for.cond.cleanup: ; preds = %for.cond.cleanup16, %entry + ret void + +for.cond.cleanup16: ; preds = %for.body17, %for.cond14.preheader + %indvars.iv.next88 = add i64 %indvars.iv87, %5 + %cmp = icmp slt i64 %indvars.iv.next88, %7 + br i1 %cmp, label %for.cond14.preheader, label %for.cond.cleanup + +for.body17: ; preds = %for.body17, %for.body17.lr.ph + %indvars.iv = phi i64 [ %1, %for.body17.lr.ph ], [ %indvars.iv.next, %for.body17 ] + %11 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %conv4, i16 %conv5) + %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %conv4, i16 %conv3, ptr %arrayidx, i64 %conv20) + %13 = trunc i64 %indvars.iv to i32 + %mul23 = shl nsw i32 %13, 2 + %add24 = add nsw i32 %mul23, %mul22 + %idxprom25 = sext i32 %add24 to i64 + %arrayidx26 = getelementptr inbounds i8, ptr %B_rcr4, i64 %idxprom25 + %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %conv13, ptr %arrayidx26, i64 %conv28) + %15 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %conv4, i16 %conv13, i16 %conv3, x86_amx %11, x86_amx %12, x86_amx %14) + %16 = add nsw i64 %indvars.iv, %10 + %arrayidx33 = getelementptr inbounds i32, ptr %C, i64 %16 + tail call void @llvm.x86.tilestored64.internal(i16 %conv4, i16 %conv5, ptr %arrayidx33, i64 %conv34, x86_amx %15) + %indvars.iv.next = add i64 %indvars.iv, %2 + %cmp15 = icmp slt i64 %indvars.iv.next, %3 + br i1 %cmp15, label %for.body17, label %for.cond.cleanup16 +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-after tileconfig | FileCheck %s + +; Test that the tile register is allocated in a separate pass.
+ +define i16 @foo(i32 noundef %t, i16 %row, i16 %col) nounwind { + ; CHECK-LABEL: name: foo + ; CHECK: bb.0.entry: + ; CHECK-NEXT: liveins: $esi, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %12.sub_32bit:gr64_nosp = COPY $edx + ; CHECK-NEXT: undef %13.sub_32bit:gr64_with_sub_8bit = COPY $esi + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 20, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 20, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 49, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 49, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 16, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 48, align 4) + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit + ; CHECK-NEXT: [[PTILEZEROV1:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit + ; CHECK-NEXT: [[PTILEZEROV2:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit + ; CHECK-NEXT: dead [[PTILEZEROV2]]:tile = PTDPBSSDV %13.sub_16bit, %12.sub_16bit, %12.sub_16bit, [[PTILEZEROV2]], [[PTILEZEROV]], [[PTILEZEROV1]] + ; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r %13, 1, %12, 0, $noreg + ; CHECK-NEXT: $ax = COPY [[LEA64_32r]].sub_16bit + ; CHECK-NEXT: RET 0, killed $ax +entry: + %0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1) + %4 = add i16 %row, %col + ret i16 %4 +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll --- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll @@ -106,14 +106,14 @@ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %r14w +; CHECK-NEXT: movw $8, %bp ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: movl $32, %r15d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header @@ -123,12 +123,12 @@ 
; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0 -; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1 +; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 +; CHECK-NEXT: tileloadd (%r14,%r15), %tmm1 ; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15) -; CHECK-NEXT: incl %ebp -; CHECK-NEXT: cmpw $100, %bp +; CHECK-NEXT: tilestored %tmm2, (%r14,%r15) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpw $100, %bx ; CHECK-NEXT: jl .LBB1_2 ; CHECK-NEXT: .LBB1_3: # %exit ; CHECK-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -131,32 +131,32 @@ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %r15w +; CHECK-NEXT: movw $8, %bp ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_3 ; CHECK-NEXT: # %bb.1: # %loop.header.preheader -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movq %rdi, %r15 ; CHECK-NEXT: movl $32, %r14d -; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) +; CHECK-NEXT: tilestored %tmm0, (%r15,%r14) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 -; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2 +; CHECK-NEXT: tileloadd (%r15,%r14), %tmm1 +; CHECK-NEXT: tileloadd (%r15,%r14), %tmm2 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14) +; CHECK-NEXT: tilestored %tmm0, (%r15,%r14) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: incl %ebp -; CHECK-NEXT: cmpw $100, %bp +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpw $100, %bx ; CHECK-NEXT: jl .LBB1_2 ; CHECK-NEXT: .LBB1_3: # %exit ; CHECK-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -146,6 +146,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Greedy Register Allocator ; CHECK-NEXT: Tile Register Configure +; CHECK-NEXT: Greedy Register Allocator ; CHECK-NEXT: Virtual Register Rewriter ; CHECK-NEXT: Register Allocation Pass Scoring ; CHECK-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/X86/statepoint-ra.ll b/llvm/test/CodeGen/X86/statepoint-ra.ll --- a/llvm/test/CodeGen/X86/statepoint-ra.ll +++ b/llvm/test/CodeGen/X86/statepoint-ra.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -pass-remarks-filter=regalloc -pass-remarks-output=%t.yaml -stop-after=greedy -o - < %s 2>&1 | FileCheck %s +; RUN: llc -x86-tile-ra=0 -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -pass-remarks-filter=regalloc -pass-remarks-output=%t.yaml -stop-after=greedy -o - < %s 2>&1 | FileCheck %s ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s target triple = "x86_64-unknown-linux-gnu"