diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -281,6 +281,7 @@ Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; + bool ShouldAllocateRegister(const Register Reg) const; int getStackSpaceFor(Register VirtReg); void spill(MachineBasicBlock::iterator Before, Register VirtReg, MCPhysReg AssignedReg, bool Kill, bool LiveOut); @@ -300,6 +301,11 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) +bool RegAllocFast::ShouldAllocateRegister(const Register Reg) const { + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + return ShouldAllocateClass(*TRI, RC); +} + void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) RegUnitStates[*UI] = NewState; @@ -1071,6 +1077,8 @@ assert(RegClassDefCounts.size() == TRI->getNumRegClasses()); if (Reg.isVirtual()) { + if (!ShouldAllocateRegister(Reg)) + return; const TargetRegisterClass *OpRC = MRI->getRegClass(Reg); for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses(); RCIdx != RCIdxEnd; ++RCIdx) { @@ -1130,6 +1138,8 @@ if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { + if (!ShouldAllocateRegister(Reg)) + continue; if (MO.isDef()) { HasDef = true; HasVRegDef = true; @@ -1193,7 +1203,7 @@ } if (MO.isDef()) { - if (Reg.isVirtual()) + if (Reg.isVirtual() && ShouldAllocateRegister(Reg)) DefOperandIndexes.push_back(I); addRegClassDefCounts(RegClassDefCounts, Reg); @@ -1253,7 +1263,7 @@ if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); - if (Reg.isVirtual()) + if (Reg.isVirtual() && ShouldAllocateRegister(Reg)) defineVirtReg(MI, I, Reg); } } @@ -1283,6 +1293,8 @@ Register Reg = MO.getReg(); if (!Reg) continue; + if (Reg.isVirtual() && !ShouldAllocateRegister(Reg)) + continue; assert(Reg.isPhysical()); if 
(MRI->isReserved(Reg)) continue; @@ -1329,7 +1341,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !ShouldAllocateRegister(Reg)) continue; if (MO.isUndef()) { @@ -1356,7 +1368,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !ShouldAllocateRegister(Reg)) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1448,7 +1460,7 @@ continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !ShouldAllocateRegister(Reg)) continue; DenseMap::iterator DI; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -1207,7 +1207,7 @@ // Prepare for fast register allocation at O0. // Todo: May better check the volatile model of AMX code, not just // by checking Attribute::OptimizeNone and CodeGenOpt::None. - if (TM->getOptLevel() == CodeGenOpt::None) { + if (0) { // FIXME: disabled; was (TM->getOptLevel() == CodeGenOpt::None) { // If Front End not use O0 but the Mid/Back end use O0, (e.g. // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make // sure the amx data is volatile, that is nessary for AMX fast diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -120,6 +120,8 @@ bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; + bool isTileRegisterClass(const TargetRegisterClass *RC) const; + /// Returns true if PhysReg is a fixed register. 
bool isFixedRegister(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -676,6 +676,10 @@ return X86GenRegisterInfo::isFixedRegister(MF, PhysReg); } +bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { + return RC->getID() == X86::TILERegClassID; +} + void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -387,6 +387,7 @@ void addPreEmitPass2() override; void addPreSched2() override; bool addPreRewrite() override; + bool addRegAssignAndRewriteFast() override; std::unique_ptr getCSEConfig() const override; }; @@ -610,6 +611,17 @@ })); } +static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC); +} + +bool X86PassConfig::addRegAssignAndRewriteFast() { + // Allocate AMX registers separately. 
+ addPass(createFastRegisterAllocator(onlyAllocateTileRegisters, false)); + return TargetPassConfig::addRegAssignAndRewriteFast(); +} + bool X86PassConfig::addPostFastRegAllocRewrite() { addPass(createX86FastTileConfigPass()); return true; diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll --- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll @@ -7,10 +7,10 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind { ; AVX512-LABEL: test_api: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-NEXT: subq $8192, %rsp # imm = 0x2000 +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $9056, %rsp # imm = 0x2360 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) @@ -18,127 +18,121 @@ ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: movw %si, %ax ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: cmpl $0, %edi ; AVX512-NEXT: je .LBB0_2 ; AVX512-NEXT: # %bb.1: # %if.then -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 
8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movl $buf, %r9d +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r8w # 2-byte Reload +; AVX512-NEXT: movl $buf, %eax +; AVX512-NEXT: movl $32, %ecx +; AVX512-NEXT: movw $8, %r9w +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %r9w, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: tileloadd (%rax,%rcx), %tmm0 +; AVX512-NEXT: movabsq $64, %rax +; AVX512-NEXT: tilestored %tmm0, 3968(%rsp,%rax) # 1024-byte Folded Spill +; AVX512-NEXT: movl $buf, %eax ; AVX512-NEXT: movl $32, %r10d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw $8, %dx +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %dl, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r9d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) -; AVX512-NEXT: movl $buf, %r8d -; AVX512-NEXT: movl $32, %r9d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: movl $buf, %esi -; AVX512-NEXT: movl $32, %edi -; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: tileloadd (%rax,%r10), %tmm0 +; AVX512-NEXT: movabsq $64, %rax +; AVX512-NEXT: tilestored %tmm0, 
6016(%rsp,%rax) # 1024-byte Folded Spill +; AVX512-NEXT: movl $buf, %r10d +; AVX512-NEXT: movl $32, %r11d +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: tileloadd (%r10,%r11), %tmm0 +; AVX512-NEXT: movabsq $64, %r10 +; AVX512-NEXT: tilestored %tmm0, 7040(%rsp,%r10) # 1024-byte Folded Spill +; AVX512-NEXT: movw %r9w, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movw %r8w, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movw %si, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: jmp .LBB0_3 ; AVX512-NEXT: .LBB0_2: # %if.else -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movl $buf2, %r9d +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r8w # 2-byte Reload +; AVX512-NEXT: movl $buf2, %eax +; AVX512-NEXT: movl $32, %ecx +; AVX512-NEXT: movw $8, %r9w +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %r9w, {{[0-9]+}}(%rsp) +; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: tileloadd (%rax,%rcx), %tmm0 +; AVX512-NEXT: movabsq $64, %rax +; AVX512-NEXT: tilestored %tmm0, 896(%rsp,%rax) # 1024-byte Folded Spill +; AVX512-NEXT: movl $buf2, %eax ; AVX512-NEXT: movl $32, %r10d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw $8, %dx +; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, 
{{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %dl, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r9d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tilestored %tmm0, (%r8,%r9) -; AVX512-NEXT: movl $buf2, %r8d -; AVX512-NEXT: movl $32, %r9d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: movl $buf2, %esi -; AVX512-NEXT: movl $32, %edi -; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: tileloadd (%rax,%r10), %tmm0 +; AVX512-NEXT: movabsq $64, %rax +; AVX512-NEXT: tilestored %tmm0, 1920(%rsp,%rax) # 1024-byte Folded Spill +; AVX512-NEXT: movl $buf2, %r10d +; AVX512-NEXT: movl $32, %r11d +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: tileloadd (%r10,%r11), %tmm0 +; AVX512-NEXT: movabsq $64, %r10 +; AVX512-NEXT: tilestored %tmm0, 2944(%rsp,%r10) # 1024-byte Folded Spill +; AVX512-NEXT: movw %r9w, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movw %r8w, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movw %si, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: .LBB0_3: # %if.end ; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload ; AVX512-NEXT: movw 
{{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movl $64, %r10d -; AVX512-NEXT: movw $8, %di -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r8,%r10), %tmm0 -; AVX512-NEXT: movabsq $64, %r8 -; AVX512-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill -; AVX512-NEXT: movl $64, %r10d -; AVX512-NEXT: movw $8, %r8w -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Reload +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Reload +; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r8w # 2-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: movw %r11w, %dx +; AVX512-NEXT: movw %r9w, %si ; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %dl, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp) ; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %r9w, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movb %r11b, {{[0-9]+}}(%rsp) +; AVX512-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm2 -; 
AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tileloadd (%rsi,%r8), %tmm0 -; AVX512-NEXT: movw $8, %si +; AVX512-NEXT: movabsq $64, %r15 +; AVX512-NEXT: tileloadd (%r14,%r15), %tmm1 +; AVX512-NEXT: movabsq $64, %r11 +; AVX512-NEXT: tileloadd (%r10,%r11), %tmm2 ; AVX512-NEXT: movabsq $64, %r8 -; AVX512-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload +; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0 +; AVX512-NEXT: movw $8, %dx ; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-NEXT: movl $buf, %edx ; AVX512-NEXT: movl $32, %esi ; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: addq $9056, %rsp # imm = 0x2360 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 ; AVX512-NEXT: tilerelease ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll --- a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll @@ -63,115 +63,61 @@ ; ; AVX512-O0-LABEL: foo: ; AVX512-O0: # %bb.0: # %entry -; AVX512-O0-NEXT: pushq %rbp -; AVX512-O0-NEXT: movq %rsp, %rbp -; AVX512-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; AVX512-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; AVX512-O0-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; AVX512-O0-NEXT: movw $32, %cx ; AVX512-O0-NEXT: movw $8, %ax -; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; 
AVX512-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; AVX512-O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; AVX512-O0-NEXT: tilezero %tmm0 -; AVX512-O0-NEXT: movl $64, %esi -; AVX512-O0-NEXT: movw $32, %cx -; AVX512-O0-NEXT: movw $8, %ax -; AVX512-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-O0-NEXT: movl $64, %esi -; AVX512-O0-NEXT: movw $32, %cx -; AVX512-O0-NEXT: movw $8, %ax -; AVX512-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX512-O0-NEXT: movl $1024, %edx # imm = 0x400 ; AVX512-O0-NEXT: movw $32, %cx ; AVX512-O0-NEXT: movw $8, %ax ; AVX512-O0-NEXT: tilestored %tmm0, (%rdi,%rdx) -; AVX512-O0-NEXT: movq %rbp, %rsp -; AVX512-O0-NEXT: popq %rbp ; AVX512-O0-NEXT: tilerelease ; AVX512-O0-NEXT: vzeroupper ; AVX512-O0-NEXT: retq ; ; AVX2-O0-LABEL: foo: ; AVX2-O0: # %bb.0: # %entry -; AVX2-O0-NEXT: pushq %rbp -; AVX2-O0-NEXT: movq %rsp, %rbp -; AVX2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX2-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; AVX2-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; AVX2-O0-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; AVX2-O0-NEXT: movw $32, %cx ; AVX2-O0-NEXT: movw $8, %ax -; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; AVX2-O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; AVX2-O0-NEXT: tilezero %tmm0 -; AVX2-O0-NEXT: movl $64, %esi -; AVX2-O0-NEXT: movw $32, %cx -; AVX2-O0-NEXT: movw $8, %ax -; AVX2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-O0-NEXT: movl $64, %esi -; AVX2-O0-NEXT: movw 
$32, %cx -; AVX2-O0-NEXT: movw $8, %ax -; AVX2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; AVX2-O0-NEXT: movl $1024, %edx # imm = 0x400 ; AVX2-O0-NEXT: movw $32, %cx ; AVX2-O0-NEXT: movw $8, %ax ; AVX2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx) -; AVX2-O0-NEXT: movq %rbp, %rsp -; AVX2-O0-NEXT: popq %rbp ; AVX2-O0-NEXT: tilerelease ; AVX2-O0-NEXT: vzeroupper ; AVX2-O0-NEXT: retq ; ; SSE2-O0-LABEL: foo: ; SSE2-O0: # %bb.0: # %entry -; SSE2-O0-NEXT: pushq %rbp -; SSE2-O0-NEXT: movq %rsp, %rbp -; SSE2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; SSE2-O0-NEXT: subq $3072, %rsp # imm = 0xC00 ; SSE2-O0-NEXT: xorps %xmm0, %xmm0 -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; SSE2-O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: movw $32, %cx ; SSE2-O0-NEXT: movw $8, %ax -; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; SSE2-O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; SSE2-O0-NEXT: tilezero %tmm0 -; SSE2-O0-NEXT: movl $64, %esi -; SSE2-O0-NEXT: movw $32, %cx -; SSE2-O0-NEXT: movw $8, %ax -; SSE2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-O0-NEXT: movl $64, %esi -; SSE2-O0-NEXT: movw $32, %cx -; SSE2-O0-NEXT: movw $8, %ax -; SSE2-O0-NEXT: movb %al, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; 
SSE2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 ; SSE2-O0-NEXT: movl $1024, %edx # imm = 0x400 ; SSE2-O0-NEXT: movw $32, %cx ; SSE2-O0-NEXT: movw $8, %ax ; SSE2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx) -; SSE2-O0-NEXT: movq %rbp, %rsp -; SSE2-O0-NEXT: popq %rbp ; SSE2-O0-NEXT: tilerelease ; SSE2-O0-NEXT: retq entry: