diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -228,6 +228,9 @@
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;
 
+void initializeSIAvoidZeroExecMaskPass(PassRegistry &);
+extern char &SIAvoidZeroExecMaskID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -265,6 +265,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeSIAvoidZeroExecMaskPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1227,6 +1228,7 @@
   // cases.
   addPass(&PostRAHazardRecognizerID);
   addPass(&BranchRelaxationPassID);
+  addPass(&SIAvoidZeroExecMaskID);
 }
 
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -112,6 +112,7 @@
   R600RegisterInfo.cpp
   SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
+  SIAvoidZeroExecMask.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
   SIPreAllocateWWMRegs.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIAvoidZeroExecMask.cpp b/llvm/lib/Target/AMDGPU/SIAvoidZeroExecMask.cpp
new file
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIAvoidZeroExecMask.cpp
@@ -0,0 +1,247 @@
+//===-- SIAvoidZeroExecMask.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// In case the mask to be restored needs reloading at the beginning of a block,
+// relax the eager exec mask evaluation in the corresponding predecessor block.
+//
+// For instance, transform the following
+//
+// $exec = /* exec mask evaluation */
+// s_cbranch_execz TARGET
+// FALLTHROUGH:
+//
+// to
+//
+// $null = /* exec mask evaluation */
+// s_cbranch_scc0 TARGET
+// L:
+// $exec = /* exec mask evaluation */
+// FALLTHROUGH:
+//
+// and transform the following
+//
+// $exec = /* exec mask evaluation */
+// s_cbranch_execnz TARGET
+// FALLTHROUGH:
+//
+// to
+//
+// $null = /* exec mask evaluation */
+// s_cbranch_scc0 FALLTHROUGH
+// L:
+// $exec = /* exec mask evaluation */
+// s_branch TARGET
+// FALLTHROUGH:
+//
+
+#define DEBUG_TYPE "si-avoid-zero-exec-mask"
+
+namespace {
+
+class SIAvoidZeroExecMask : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+  std::unique_ptr<RegScavenger> RS;
+
+  bool IsWave32;
+  Register ExecMask;
+
+  bool isExecMaskRestore(const MachineInstr &MI) const;
+  MachineOperand *findOnlyImplicitSCCDefOperand(MachineInstr *MI) const;
+
+  bool relaxBranchEXEC(MachineBasicBlock &MBB, MachineBasicBlock *Target) const;
+
+public:
+  static char ID;
+
+  SIAvoidZeroExecMask() : MachineFunctionPass(ID) {
+    initializeSIAvoidZeroExecMaskPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAvoidZeroExecMask, DEBUG_TYPE, "SI avoid zero exec mask",
+                false, false)
+
+char SIAvoidZeroExecMask::ID = 0;
+
+char &llvm::SIAvoidZeroExecMaskID = SIAvoidZeroExecMask::ID;
+
+bool SIAvoidZeroExecMask::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+
+  IsWave32 = ST.isWave32();
+  ExecMask = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+  assert(TRI->trackLivenessAfterRegAlloc(MF));
+  RS.reset(new RegScavenger());
+
+  bool Changed = false;
+  for (auto MBBI = MF.begin(), MBBE = MF.end(); MBBI != MBBE; /*EMPTY*/) {
+    auto &MBB = *MBBI++;
+    auto MII = MBB.getLastNonDebugInstr();
+    // Skip if there's no terminator in this block.
+    if (MII == MBB.end() || !MII->isTerminator())
+      continue;
+    // Consider EXECZ & EXECNZ only.
+    if (MII->getOpcode() != AMDGPU::S_CBRANCH_EXECZ &&
+        MII->getOpcode() != AMDGPU::S_CBRANCH_EXECNZ)
+      continue;
+    // Find the block to check for the exec mask restoration: the branch
+    // target for EXECZ or the fall-through block for EXECNZ.
+    MachineBasicBlock *Target = MII->getOperand(0).getMBB();
+    if (MII->getOpcode() == AMDGPU::S_CBRANCH_EXECNZ)
+      Target = MBB.getFallThrough();
+    // If the mask restore there is preceded by a readfirstlane, i.e. the mask
+    // needs reloading, relax the eager exec evaluation in MBB.
+    bool Found = false;
+    unsigned Count = 0;
+    for (auto &MI : *Target) {
+      // Only scan the first few instructions; beyond that the pattern is
+      // unlikely.
+      if (++Count > 8)
+        break;
+      Found |= (MI.getOpcode() == AMDGPU::V_READFIRSTLANE_B32);
+      if (!isExecMaskRestore(MI))
+        continue;
+      // No need to handle the restoration if no mask needs reloading.
+      if (!Found)
+        break;
+      // Otherwise, make sure the target is not branched into with a zero exec
+      // mask.
+      Changed |= relaxBranchEXEC(MBB, Target);
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+// Check whether MI is an execution mask restore instruction, which is
+// implemented as
+//
+// OR $exec, $exec, %mask
+//
+bool SIAvoidZeroExecMask::isExecMaskRestore(const MachineInstr &MI) const {
+  unsigned MaskOR = IsWave32 ?
      AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64;
+  if (MI.getOpcode() != MaskOR)
+    return false;
+  const MachineOperand &Dst = MI.getOperand(0);
+  if (!Dst.isReg() || Dst.getReg() != ExecMask)
+    return false;
+  const MachineOperand &S0 = MI.getOperand(1);
+  if (!S0.isReg() || S0.getReg() != ExecMask)
+    return false;
+  return true;
+}
+
+MachineOperand *
+SIAvoidZeroExecMask::findOnlyImplicitSCCDefOperand(MachineInstr *MI) const {
+  if (MI->getDesc().getNumImplicitDefs() != 1)
+    return nullptr;
+  for (unsigned I = MI->getNumExplicitOperands(), E = MI->getNumOperands();
+       I != E; ++I) {
+    MachineOperand &MO = MI->getOperand(I);
+    if (MO.isDef() && MO.isImplicit() && MO.getReg() == AMDGPU::SCC)
+      return &MO;
+  }
+  return nullptr;
+}
+
+bool SIAvoidZeroExecMask::relaxBranchEXEC(MachineBasicBlock &MBB,
+                                          MachineBasicBlock *Target) const {
+  auto MBBI = MBB.getFirstTerminator();
+  // Skip if there's no terminator.
+  if (MBBI == MBB.end())
+    return false;
+  if (MBBI->getOpcode() != AMDGPU::S_CBRANCH_EXECZ &&
+      MBBI->getOpcode() != AMDGPU::S_CBRANCH_EXECNZ)
+    return false;
+  // Skip if there's no instruction before the branch to evaluate the exec
+  // mask.
+  if (MBBI == MBB.begin())
+    return false;
+  auto Br = &*MBBI;
+
+  unsigned MaskMov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  unsigned MaskAnd = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+
+  // Check whether the previous instruction is the evaluation of the exec
+  // mask.
+  --MBBI;
+  if (MBBI->getNumExplicitDefs() == 0)
+    return false;
+  auto &Op = MBBI->getOperand(0);
+  if (!Op.isReg() || Op.getReg() != ExecMask)
+    return false;
+  // That evaluation should only implicitly define SCC. The only exception is
+  // S_MOV, which defines no SCC and is cloned as an S_AND below.
+  if (MBBI->getOpcode() != MaskMov && !findOnlyImplicitSCCDefOperand(&*MBBI))
+    return false;
+  auto ExecEval = &*MBBI;
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstr *Cloned = nullptr;
+  // Clone the mask evaluation into a temp destination. As SCC is updated with
+  // the same result, we can branch on SCC0 before the zero mask is written to
+  // EXEC.
+  if (ExecEval->getOpcode() == MaskMov) {
+    Cloned =
+        BuildMI(MBB, ExecEval, ExecEval->getDebugLoc(), TII->get(MaskAnd), Tmp)
+            .addReg(ExecEval->getOperand(1).getReg())
+            .addReg(ExecEval->getOperand(1).getReg());
+  } else {
+    Cloned = MF->CloneMachineInstr(&*ExecEval);
+    MBB.insert(ExecEval, Cloned);
+  }
+  Cloned->clearKillInfo();
+  Cloned->getOperand(0).setReg(Tmp);
+  Cloned->getOperand(0).setIsDead();
+  // Scavenge a temp SGPR as the destination of the duplicated evaluation. We
+  // only care about the updated SCC.
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegisterBackwards(
+      AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(Cloned), false, 0);
+  MRI.replaceRegWith(Tmp, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+  // Clear the dead flag on the SCC implicit def; the new branch reads it.
+  findOnlyImplicitSCCDefOperand(Cloned)->setIsDead(false);
+
+  // Split the block.
+  auto L = MBB.splitAt(*Cloned, true);
+
+  // Add the branch on SCC0.
+  BuildMI(&MBB, Br->getDebugLoc(), TII->get(AMDGPU::S_CBRANCH_SCC0))
+      .addMBB(Target);
+  MBB.addSuccessor(Target);
+  if (Br->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) {
+    // Remove the original branch on execz.
+    Br->eraseFromParent();
+  } else {
+    // Replace the original branch on execnz with an unconditional branch.
+    assert(Br->getOpcode() == AMDGPU::S_CBRANCH_EXECNZ);
+    Br->setDesc(TII->get(AMDGPU::S_BRANCH));
+  }
+  // Remove `Target` from L's successors.
+ L->removeSuccessor(Target, true); + + return true; +} diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -220,8 +220,12 @@ ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} -; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]] +; VMEM: s_xor_b64 s[{{[0-9]+:[0-9]+}}], exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} +; VMEM-NEXT: s_cbranch_scc0 [[ENDIF:BB[0-9]+_[0-9]+]] +; VMEM-NEXT: ; %bb.{{[0-9]+}}: ; %Flow +; VMEM-NEXT: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} +; VGPR: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} +; VGPR-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: ; %bb.{{[0-9]+}}: ; %if
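Note (illustration, not part of the patch): a minimal sketch of the EXECZ rewrite on a concrete wave64 mask evaluation. The SGPR numbers, block labels, and the scavenged pair s[6:7] are assumptions for the example only.

Before:

    s_and_b64 exec, exec, s[4:5]      ; eager evaluation; exec may become zero here
    s_cbranch_execz BB0_3             ; BB0_3 is entered with exec == 0 when the mask is empty
  BB0_2:                              ; fall-through

After:

    s_and_b64 s[6:7], exec, s[4:5]    ; clone into a scavenged pair; only its SCC result is used
    s_cbranch_scc0 BB0_3              ; mask would be zero: branch while the old exec is still live
  BB0_1:                              ; block produced by splitAt()
    s_and_b64 exec, exec, s[4:5]      ; original evaluation, now only on the non-zero path
  BB0_2:                              ; fall-through

The EXECNZ case is analogous: s_cbranch_scc0 targets the fall-through block, and the original conditional branch becomes an s_branch to TARGET placed after the exec evaluation.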