diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -210,8 +210,8 @@
 void initializeSIPreEmitPeepholePass(PassRegistry &);
 extern char &SIPreEmitPeepholeID;
 
-void initializeSIInsertSkipsPass(PassRegistry &);
-extern char &SIInsertSkipsPassID;
+void initializeSILateBranchLoweringPass(PassRegistry &);
+extern char &SILateBranchLoweringPassID;
 
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -251,7 +251,7 @@
   initializeSILowerControlFlowPass(*PR);
   initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIPreEmitPeepholePass(*PR);
-  initializeSIInsertSkipsPass(*PR);
+  initializeSILateBranchLoweringPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
   initializeSIPreAllocateWWMRegsPass(*PR);
@@ -1216,8 +1216,9 @@
   addPass(&SIInsertHardClausesID);
 
   addPass(&SIRemoveShortExecBranchesID);
-  addPass(&SIInsertSkipsPassID);
-  addPass(&SIPreEmitPeepholeID);
+  addPass(&SILateBranchLoweringPassID);
+  if (getOptLevel() > CodeGenOpt::None)
+    addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
   // are multiple scheduling regions in a basic block, the regions are scheduled
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,7 +119,7 @@
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
   SIInsertHardClauses.cpp
-  SIInsertSkips.cpp
+  SILateBranchLowering.cpp
   SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
rename from llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
rename to llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -1,4 +1,4 @@
-//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
+//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,28 +14,23 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "si-insert-skips"
+#define DEBUG_TYPE "si-late-branch-lowering"
 
 namespace {
 
-class SIInsertSkips : public MachineFunctionPass {
+class SILateBranchLowering : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   MachineDominatorTree *MDT = nullptr;
 
-  MachineBasicBlock *EarlyExitBlock = nullptr;
-  bool EarlyExitClearsExec = false;
-
-  void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
-
-  void earlyTerm(MachineInstr &MI);
+  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
 
 public:
   static char ID;
@@ -43,12 +38,12 @@
   unsigned MovOpc;
   Register ExecReg;
 
-  SIInsertSkips() : MachineFunctionPass(ID) {}
+  SILateBranchLowering() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
-    return "SI insert s_cbranch_execz instructions";
+    return "SI Final Branch Preparation";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -60,15 +55,15 @@
 
 } // end anonymous namespace
 
-char SIInsertSkips::ID = 0;
+char SILateBranchLowering::ID = 0;
 
-INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
                       "SI insert s_cbranch_execz instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
                     "SI insert s_cbranch_execz instructions", false, false)
 
-char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID;
 
 static void generateEndPgm(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
@@ -89,27 +84,6 @@
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }
 
-void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
-                                         bool ClearExec) {
-  MachineFunction *MF = MBB.getParent();
-  DebugLoc DL;
-
-  if (!EarlyExitBlock) {
-    EarlyExitBlock = MF->CreateMachineBasicBlock();
-    MF->insert(MF->end(), EarlyExitBlock);
-    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
-                   MF->getFunction().getCallingConv() ==
-                       CallingConv::AMDGPU_PS);
-    EarlyExitClearsExec = false;
-  }
-
-  if (ClearExec && !EarlyExitClearsExec) {
-    auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
-    EarlyExitClearsExec = true;
-  }
-}
-
 static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                        MachineDominatorTree *MDT) {
   MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
@@ -125,12 +99,11 @@
   MDT->getBase().applyUpdates(DTUpdates);
 }
 
-void SIInsertSkips::earlyTerm(MachineInstr &MI) {
+void SILateBranchLowering::earlyTerm(MachineInstr &MI,
+                                     MachineBasicBlock *EarlyExitBlock) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc DL = MI.getDebugLoc();
 
-  ensureEarlyExitBlock(MBB, true);
-
   auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
                       .addMBB(EarlyExitBlock);
   auto Next = std::next(MI.getIterator());
@@ -142,7 +115,7 @@
   MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
 }
 
-bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
@@ -152,6 +125,7 @@
   ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   SmallVector<MachineInstr *, 4> EarlyTermInstrs;
+  SmallVector<MachineInstr *, 4> EpilogInstrs;
   bool MadeChange = false;
 
   for (MachineBasicBlock &MBB : MF) {
@@ -163,7 +137,7 @@
       switch (MI.getOpcode()) {
       case AMDGPU::S_BRANCH:
         // Optimize out branches to the next block.
-        // FIXME: Shouldn't this be handled by BranchFolding?
+        // This only occurs in -O0 when BranchFolding is not executed.
         if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
           assert(&MI == &MBB.back());
           MI.eraseFromParent();
@@ -175,20 +149,72 @@
         EarlyTermInstrs.push_back(&MI);
         break;
 
+      case AMDGPU::SI_RETURN_TO_EPILOG:
+        EpilogInstrs.push_back(&MI);
+        break;
+
       default:
         break;
       }
     }
   }
 
-  for (MachineInstr *Instr : EarlyTermInstrs) {
-    // Early termination in GS does nothing
-    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
-      earlyTerm(*Instr);
-    Instr->eraseFromParent();
+  // Lower any early exit branches first
+  if (!EarlyTermInstrs.empty()) {
+    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
+    DebugLoc DL;
+
+    MF.insert(MF.end(), EarlyExitBlock);
+    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
+            ExecReg)
+        .addImm(0);
+    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
+                   MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+    for (MachineInstr *Instr : EarlyTermInstrs) {
+      // Early termination in GS does nothing
+      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+        earlyTerm(*Instr, EarlyExitBlock);
+      Instr->eraseFromParent();
+    }
+
+    EarlyTermInstrs.clear();
+    MadeChange = true;
+  }
+
+  // Now check return to epilog instructions occur at function end
+  if (!EpilogInstrs.empty()) {
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+    // If there are multiple returns to epilog then all will
+    // become jumps to new empty end block.
+    if (EpilogInstrs.size() > 1) {
+      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+      MF.insert(MF.end(), EmptyMBBAtEnd);
+    }
+
+    for (auto MI : EpilogInstrs) {
+      auto MBB = MI->getParent();
+      if (MBB == &MF.back() && MI == &MBB->back())
+        continue;
+
+      // SI_RETURN_TO_EPILOG is not the last instruction.
+      // Jump to empty block at function end.
+      if (!EmptyMBBAtEnd) {
+        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+        MF.insert(MF.end(), EmptyMBBAtEnd);
+      }
+
+      MBB->addSuccessor(EmptyMBBAtEnd);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+          .addMBB(EmptyMBBAtEnd);
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+
+    EpilogInstrs.clear();
   }
-  EarlyTermInstrs.clear();
-  EarlyExitBlock = nullptr;
 
   return MadeChange;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -14,7 +14,6 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
 using namespace llvm;
@@ -259,7 +258,6 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
@@ -277,34 +275,6 @@
         break;
       }
     }
-    // Check all terminators for SI_RETURN_TO_EPILOG
-    // FIXME: This is not an optimization and should be moved somewhere else.
-    while (TermI != MBB.end()) {
-      MachineInstr &MI = *TermI;
-      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
-        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
-        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
-        // because external bytecode will be appended at the end.
-        if (&MBB != &MF.back() || &MI != &MBB.back()) {
-          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
-          // at the end and jump there.
-          if (!EmptyMBBAtEnd) {
-            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
-            MF.insert(MF.end(), EmptyMBBAtEnd);
-          }
-
-          MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
-              .addMBB(EmptyMBBAtEnd);
-          MI.eraseFromParent();
-          MBBE = MBB.getFirstTerminator();
-          TermI = MBBE;
-          continue;
-        }
-      }
-      TermI++;
-    }
 
     if (!ST.hasVGPRIndexMode())
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
--- a/llvm/test/CodeGen/AMDGPU/early-term.mir
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-insert-skips -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck %s
 
 --- |
   define amdgpu_ps void @early_term_scc0_end_block() {
diff --git a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
--- a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
+++ b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir
@@ -1,4 +1,4 @@
-# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: readlane_exec0
 # GCN: bb.0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
--- a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-late-branch-lowering -o - %s | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: name: subbrev{{$}}
 # GCN: V_SUBBREV_U32_e32 0, undef $vgpr0, implicit-def $vcc, implicit killed $vcc, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/syncscopes.ll b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
--- a/llvm/test/CodeGen/AMDGPU/syncscopes.ll
+++ b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-insert-skips < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-late-branch-lowering < %s | FileCheck --check-prefix=GCN %s
 
 ; GCN-LABEL: name: syncscopes
 ; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out)