diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -166,6 +166,9 @@ void initializeSIRemoveShortExecBranchesPass(PassRegistry &); extern char &SIRemoveShortExecBranchesID; +void initializeSIPreEmitPeepholePass(PassRegistry &); +extern char &SIPreEmitPeepholeID; + void initializeSIInsertSkipsPass(PassRegistry &); extern char &SIInsertSkipsPassID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -240,6 +240,7 @@ initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIRemoveShortExecBranchesPass(*PR); + initializeSIPreEmitPeepholePass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); @@ -1029,6 +1030,7 @@ addPass(&PostRAHazardRecognizerID); addPass(&SIRemoveShortExecBranchesID); + addPass(&SIPreEmitPeepholeID); addPass(&SIInsertSkipsPassID); addPass(&BranchRelaxationPassID); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -121,6 +121,7 @@ SIOptimizeExecMaskingPreRA.cpp SIPeepholeSDWA.cpp SIPostRABundler.cpp + SIPreEmitPeephole.cpp SIRegisterInfo.cpp SIRemoveShortExecBranches.cpp SIShrinkInstructions.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -68,8 +68,6 @@ bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); - bool optimizeVccBranch(MachineInstr &MI) const; - public: static char ID; @@ -361,98 +359,6 @@ return true; } -bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { - // Match: - // sreg 
= -1 - // vcc = S_AND_B64 exec, sreg - // S_CBRANCH_VCC[N]Z - // => - // S_CBRANCH_EXEC[N]Z - bool Changed = false; - MachineBasicBlock &MBB = *MI.getParent(); - const GCNSubtarget &ST = MBB.getParent()->getSubtarget(); - const bool IsWave32 = ST.isWave32(); - const unsigned CondReg = TRI->getVCC(); - const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - - MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), - E = MBB.rend(); - bool ReadsCond = false; - unsigned Threshold = 5; - for (++A ; A != E ; ++A) { - if (!--Threshold) - return false; - if (A->modifiesRegister(ExecReg, TRI)) - return false; - if (A->modifiesRegister(CondReg, TRI)) { - if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) - return false; - break; - } - ReadsCond |= A->readsRegister(CondReg, TRI); - } - if (A == E) - return false; - - MachineOperand &Op1 = A->getOperand(1); - MachineOperand &Op2 = A->getOperand(2); - if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { - TII->commuteInstruction(*A); - Changed = true; - } - if (Op1.getReg() != ExecReg) - return Changed; - if (Op2.isImm() && Op2.getImm() != -1) - return Changed; - - unsigned SReg = AMDGPU::NoRegister; - if (Op2.isReg()) { - SReg = Op2.getReg(); - auto M = std::next(A); - bool ReadsSreg = false; - for ( ; M != E ; ++M) { - if (M->definesRegister(SReg, TRI)) - break; - if (M->modifiesRegister(SReg, TRI)) - return Changed; - ReadsSreg |= M->readsRegister(SReg, TRI); - } - if (M == E || - !M->isMoveImmediate() || - !M->getOperand(1).isImm() || - M->getOperand(1).getImm() != -1) - return Changed; - // First if sreg is only used in and instruction fold the immediate - // into that and. 
- if (!ReadsSreg && Op2.isKill()) { - A->getOperand(2).ChangeToImmediate(-1); - M->eraseFromParent(); - } - } - - if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && - MI.killsRegister(CondReg, TRI)) - A->eraseFromParent(); - - bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; - if (SReg == ExecReg) { - if (IsVCCZ) { - MI.eraseFromParent(); - return true; - } - MI.setDesc(TII->get(AMDGPU::S_BRANCH)); - } else { - MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ - : AMDGPU::S_CBRANCH_EXECNZ)); - } - - MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); - MI.addImplicitDefUseOperands(*MBB.getParent()); - - return true; -} - bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -534,11 +440,6 @@ } break; - case AMDGPU::S_CBRANCH_VCCZ: - case AMDGPU::S_CBRANCH_VCCNZ: - MadeChange |= optimizeVccBranch(MI); - break; - default: break; } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -0,0 +1,170 @@ +//===-- SIPreEmitPeephole.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass performs the peephole optimizations before code emission. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-pre-emit-peephole" + +namespace { + +class SIPreEmitPeephole : public MachineFunctionPass { +private: + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + + bool optimizeVccBranch(MachineInstr &MI) const; + +public: + static char ID; + + SIPreEmitPeephole() : MachineFunctionPass(ID) { + initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE, + "SI peephole optimizations", false, false) + +char SIPreEmitPeephole::ID = 0; + +char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID; + +bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { + // Returns true if any instruction was changed. Pattern to match: + // sreg = -1 + // vcc = S_AND_B64 exec, sreg (S_AND_B32 / exec_lo in wave32) + // S_CBRANCH_VCC[N]Z + // => + // S_CBRANCH_EXEC[N]Z (S_BRANCH, or no branch at all, when sreg is exec) + // We sometimes see this after basic block placement, when a block assigning + // -1 to a saved mask is combined with one that consumes it and branches. + bool Changed = false; + MachineBasicBlock &MBB = *MI.getParent(); + const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); + const bool IsWave32 = ST.isWave32(); + const unsigned CondReg = TRI->getVCC(); + const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned And = IsWave32 ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), + E = MBB.rend(); + bool ReadsCond = false; + unsigned Threshold = 5; + for (++A; A != E; ++A) { + if (!--Threshold) + return false; + if (A->modifiesRegister(ExecReg, TRI)) + return false; + if (A->modifiesRegister(CondReg, TRI)) { + if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) + return false; + break; + } + ReadsCond |= A->readsRegister(CondReg, TRI); + } + if (A == E) + return false; + + MachineOperand &Op1 = A->getOperand(1); + MachineOperand &Op2 = A->getOperand(2); + if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() && + Op2.getReg() == ExecReg) { + TII->commuteInstruction(*A); + Changed = true; + } + if (!Op1.isReg() || Op1.getReg() != ExecReg) + return Changed; + if (Op2.isImm() && Op2.getImm() != -1) + return Changed; + + Register SReg; + if (Op2.isReg()) { + SReg = Op2.getReg(); + auto M = std::next(A); + bool ReadsSreg = false; + for (; M != E; ++M) { + if (M->definesRegister(SReg, TRI)) + break; + if (M->modifiesRegister(SReg, TRI)) + return Changed; + ReadsSreg |= M->readsRegister(SReg, TRI); + } + if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() || + M->getOperand(1).getImm() != -1) + return Changed; + // First, if sreg is only used in the AND instruction, fold the immediate + // into that AND. + if (!ReadsSreg && Op2.isKill()) { + A->getOperand(2).ChangeToImmediate(-1); + M->eraseFromParent(); + } + } + + if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && + MI.killsRegister(CondReg, TRI)) + A->eraseFromParent(); + + bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; + if (SReg == ExecReg) { + if (IsVCCZ) { + MI.eraseFromParent(); + return true; + } + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else { + MI.setDesc( + TII->get(IsVCCZ ? 
AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ)); + } + + MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.addImplicitDefUseOperands(*MBB.getParent()); + + return true; +} + +bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.end()) + continue; + + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { + case AMDGPU::S_CBRANCH_VCCZ: + case AMDGPU::S_CBRANCH_VCCNZ: + Changed |= optimizeVccBranch(MI); + break; + + default: + break; + } + } + + return Changed; +} diff --git a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir --- a/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir @@ -1,5 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s --- # GCN-LABEL: name: and_execz_mov_vccz