diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -32,6 +32,7 @@
   X86CmovConversion.cpp
   X86DomainReassignment.cpp
   X86DiscriminateMemOps.cpp
+  X86LowerTileCopy.cpp
   X86LowerAMXType.cpp
   X86TileConfig.cpp
   X86PreTileConfig.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -76,10 +76,15 @@
 /// Return a pass that expands WinAlloca pseudo-instructions.
 FunctionPass *createX86WinAllocaExpander();

+/// Return a pass that configures the tile registers.
 FunctionPass *createX86TileConfigPass();

+/// Return a pass that inserts the pseudo tile config instruction.
 FunctionPass *createX86PreTileConfigPass();

+/// Return a pass that lowers the tile copy instruction.
+FunctionPass *createX86LowerTileCopyPass();
+
 /// Return a pass that inserts int3 at the end of the function if it ends with a
 /// CALL instruction. The pass does the same for each funclet as well. This
 /// ensures that the open interval of function start and end PCs contains all
@@ -169,6 +174,7 @@
 void initializeX86PreTileConfigPass(PassRegistry &);
 void initializeX86TileConfigPass(PassRegistry &);
 void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+void initializeX86LowerTileCopyPass(PassRegistry &);

 namespace X86AS {
 enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -0,0 +1,132 @@
+//===-- X86LowerTileCopy.cpp - Expand Tile Copy Instructions -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that lowers AMX tile copy instructions. Since
+// there is no hardware instruction that copies one tile register to another,
+// a tile copy is expanded into a store of the source tile register to a
+// stack slot followed by a load from that slot into the destination tile
+// register. An extra GR64 is needed to hold the stride, and a stack slot is
+// needed to hold the tile data. The pass runs after machine copy propagation
+// so that no copy optimization is missed, and before prolog/epilog insertion
+// so that the stack slots can still be allocated.
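+//
+// As a rough sketch (the frame-index operands below are only resolved later
+// at frame lowering, and %rax is the register currently picked to hold the
+// stride), a copy  %tmm1 = COPY %tmm0  is expanded into:
+//
+//   movq %rax, <stride slot>             # save the stride register
+//   movq $64, %rax                       # stride of the tile stack slot
+//   tilestored %tmm0, <tile slot>(,%rax)
+//   tileloadd <tile slot>(,%rax), %tmm1
+//   movq <stride slot>, %rax             # restore the stride register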
+//
+//===----------------------------------------------------------------------===//

+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-lower-tile-copy"
+
+namespace {
+
+class X86LowerTileCopy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86LowerTileCopy() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "X86 Lower Tile Copy"; }
+};
+
+} // namespace
+
+char X86LowerTileCopy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+                      false, false)
+INITIALIZE_PASS_END(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+                    false, false)
+
+void X86LowerTileCopy::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createX86LowerTileCopyPass() {
+  return new X86LowerTileCopy();
+}
+
+bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  const X86InstrInfo *TII = ST.getInstrInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII++;
+      if (!MI.isCopy())
+        continue;
+      MachineOperand &DstMO = MI.getOperand(0);
+      MachineOperand &SrcMO = MI.getOperand(1);
+      Register SrcReg = SrcMO.getReg();
+      Register DstReg = DstMO.getReg();
+      if (!X86::TILERegClass.contains(DstReg, SrcReg))
+        continue;
+
+      const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+      // Allocate a stack slot for the tile register.
+      unsigned Size = TRI->getSpillSize(X86::TILERegClass);
+      Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
+      int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+      // Allocate a stack slot for the stride register.
+      Size = TRI->getSpillSize(X86::GR64RegClass);
+      Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+      int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+      // TODO: Pick a killed register to avoid the save/reload. The problem is
+      // that live-range information is not available at this stage.
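+      // %rax is used unconditionally as the stride register; its current
+      // value is saved to StrideSS before the expanded copy and reloaded
+      // afterwards, so the expansion does not clobber it.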
+      Register GR64Cand = X86::RAX;
+
+      const DebugLoc &DL = MI.getDebugLoc();
+      // Save the stride register: mov %rax, (%sp)
+      BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
+      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
+          .addReg(GR64Cand);
+      // Load the stride: mov $64, %rax
+      BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+      // tilestored %tmm, (%sp, %idx)
+      unsigned Opc = X86::TILESTORED;
+      MachineInstr *NewMI =
+          addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
+              .addReg(SrcReg, getKillRegState(SrcMO.isKill()));
+      MachineOperand &MO = NewMI->getOperand(2);
+      MO.setReg(GR64Cand);
+      MO.setIsKill(true);
+      // tileloadd (%sp, %idx), %tmm
+      Opc = X86::TILELOADD;
+      NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
+                                TileSS);
+      // Restore the stride register: mov (%sp), %rax
+      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
+                        StrideSS);
+      MI.eraseFromParent();
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -875,6 +875,12 @@
   default:
     llvm_unreachable("Unexpected machine instruction on tile register!");
     break;
+  case X86::COPY: {
+    Register SrcReg = MI->getOperand(1).getReg();
+    ShapeT Shape = getTileShape(SrcReg, VRM, MRI);
+    VRM->assignVirt2Shape(VirtReg, Shape);
+    return Shape;
+  }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -73,6 +73,7 @@
   initializeX86CallFrameOptimizationPass(PR);
   initializeX86CmovConverterPassPass(PR);
   initializeX86TileConfigPass(PR);
+  initializeX86LowerTileCopyPass(PR);
   initializeX86ExpandPseudoPass(PR);
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
@@ -508,6 +509,7 @@
 }

 void X86PassConfig::addPostRegAlloc() {
+  addPass(createX86LowerTileCopyPass());
   addPass(createX86FloatingPointStackifierPass());
   // When -O0 is enabled, the Load Value Injection Hardening pass will fall back
   // to using the Speculative Execution Side Effect Suppression pass for
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+
+define dso_local void @test1(i8 *%buf) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $64, %eax
+; CHECK-NEXT:    movw $8, %r14w
+; CHECK-NEXT:    tileloadd (%rdi,%rax), %tmm3
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %loop.header.preheader
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl $32, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %loop.header
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
+; CHECK-NEXT:    # implicit-def: $rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpw $100, %bp
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %exit
+; CHECK-NEXT:    addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 64)
+  br i1 undef, label %loop.header, label %exit
+
+loop.header:
+  %ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
+  call void @foo()
+  br label %loop.body
+
+loop.body:
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
+  br label %loop.latch
+
+loop.latch:
+  %iv = add i16 %ivphi, 1
+  %c = icmp slt i16 %iv, 100
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+define dso_local void @test2(i8 *%buf) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %r14w
+; CHECK-NEXT:    tilezero %tmm3
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB1_3
+; CHECK-NEXT:  # %bb.1: # %loop.header.preheader
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl $32, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_2: # %loop.header
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
+; CHECK-NEXT:    # implicit-def: $rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpw $100, %bp
+; CHECK-NEXT:    jl .LBB1_2
+; CHECK-NEXT:  .LBB1_3: # %exit
+; CHECK-NEXT:    addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  br i1 undef, label %loop.header, label %exit
+
+loop.header:
+  %ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
+  call void @foo()
+  br label %loop.body
+
+loop.body:
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
+  br label %loop.latch
+
+loop.latch:
+  %iv = add i16 %ivphi, 1
+  %c = icmp slt i16 %iv, 100
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @foo()
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -45,6 +45,7 @@
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Fast Register Allocator
+; CHECK-NEXT:       X86 Lower Tile Copy
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       X86 FP Stackifier
 ; CHECK-NEXT:       Fixup Statepoint Caller Saved
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -145,6 +145,7 @@
 ; CHECK-NEXT:       Stack Slot Coloring
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       Machine Loop Invariant Code Motion
+; CHECK-NEXT:       X86 Lower Tile Copy
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       X86 FP Stackifier
 ; CHECK-NEXT:       MachineDominator Tree Construction