diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
@@ -20,10 +21,40 @@
 namespace {
 
 class SIOptimizeExecMasking : public MachineFunctionPass {
+  MachineFunction *MF = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+
+  Register isCopyFromExec(const MachineInstr &MI) const;
+  Register isCopyToExec(const MachineInstr &MI) const;
+  bool removeTerminatorBit(MachineInstr &MI) const;
+  MachineBasicBlock::reverse_iterator
+  fixTerminators(MachineBasicBlock &MBB) const;
+  MachineBasicBlock::reverse_iterator
+  findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I,
+               unsigned CopyToExec) const;
+
+  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+                              MCRegister Reg, bool UseLiveOuts = false,
+                              bool IgnoreStart = false) const;
+  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
+  MachineInstr *findInstrBackwards(MachineInstr &Origin,
+                                   std::function<bool(MachineInstr *)> Pred,
+                                   ArrayRef<MCRegister> NonModifiableRegs,
+                                   unsigned MaxInstructions = 20) const;
+  MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
+                                                  MCRegister Exec) const;
+  bool optimizeExecSequence() const;
+  bool optimizeVCmpxAndSaveexecSequence() const;
+  bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                          MachineInstr &VCmp,
+                                          MCRegister Exec) const;
+
 public:
   static char ID;
 
-public:
   SIOptimizeExecMasking() : MachineFunctionPass(ID) {
     initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
   }
@@ -53,7 +84,7 @@
 char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
 
 /// If \p MI is a copy from exec, return the register copied to.
-static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case AMDGPU::COPY:
   case AMDGPU::S_MOV_B64:
@@ -61,8 +92,7 @@
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B32_term: {
     const MachineOperand &Src = MI.getOperand(1);
-    if (Src.isReg() &&
-        Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
+    if (Src.isReg() && Src.getReg() == TRI->getExec())
       return MI.getOperand(0).getReg();
   }
   }
@@ -71,14 +101,13 @@
 }
 
 /// If \p MI is a copy to exec, return the register copied from.
-static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case AMDGPU::COPY:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::S_MOV_B32: {
     const MachineOperand &Dst = MI.getOperand(0);
-    if (Dst.isReg() &&
-        Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+    if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
         MI.getOperand(1).isReg())
       return MI.getOperand(1).getReg();
     break;
   }
@@ -173,64 +202,64 @@
 // These are only terminators to get correct spill code placement during
 // register allocation, so turn them back into normal instructions.
-static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case AMDGPU::S_MOV_B32_term: {
     bool RegSrc = MI.getOperand(1).isReg();
-    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
-    MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
+    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
+    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_OR_B32));
+    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
+    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
-    MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
@@ -241,9 +270,8 @@
 // Turn all pseudoterminators in the block into their equivalent non-terminator
 // instructions. Returns the reverse iterator to the first non-terminator
 // instruction in the block.
-static MachineBasicBlock::reverse_iterator fixTerminators(
-  const SIInstrInfo &TII,
-  MachineBasicBlock &MBB) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
 
   bool Seen = false;
@@ -252,7 +280,7 @@
     if (!I->isTerminator())
       return Seen ? FirstNonTerm : I;
 
-    if (removeTerminatorBit(TII, *I)) {
+    if (removeTerminatorBit(*I)) {
       if (!Seen) {
         FirstNonTerm = I;
         Seen = true;
@@ -263,17 +291,15 @@
   return FirstNonTerm;
 }
 
-static MachineBasicBlock::reverse_iterator findExecCopy(
-  const SIInstrInfo &TII,
-  const GCNSubtarget &ST,
-  MachineBasicBlock &MBB,
-  MachineBasicBlock::reverse_iterator I,
-  unsigned CopyToExec) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::reverse_iterator I,
+                                    unsigned CopyToExec) const {
   const unsigned InstLimit = 25;
 
   auto E = MBB.rend();
   for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
-    Register CopyFromExec = isCopyFromExec(*I, ST);
+    Register CopyFromExec = isCopyFromExec(*I);
     if (CopyFromExec.isValid())
       return I;
   }
@@ -298,11 +324,9 @@
 // an arbitrary condition based on the current MachineInstr, for instance an
 // target instruction. Breaks prematurely by returning nullptr if one of the
 // registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
-                   std::function<bool(MachineInstr *)> Pred,
-                   ArrayRef<MCRegister> NonModifiableRegs,
-                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
+    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
   MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                       E = Origin.getParent()->rend();
   unsigned CurrentIteration = 0;
@@ -310,7 +334,7 @@
   for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
     if (A->isDebugInstr())
       continue;
-    
+
     if (Pred(&*A))
       return &*A;
 
@@ -318,209 +342,64 @@
       if (A->modifiesRegister(Reg, TRI))
         return nullptr;
     }
-    
+
     ++CurrentIteration;
   }
 
   return nullptr;
 }
-
 
 // Determine if a register Reg is not re-defined and still in use
 // in the range (Stop..Start].
 // It does so by backwards calculating liveness from the end of the BB until
 // either Stop or the beginning of the BB is reached.
 // After liveness is calculated, we can determine if Reg is still in use and not
 // defined inbetween the instructions.
-static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
-                                   MCRegister Reg, const SIRegisterInfo *TRI,
-                                   MachineRegisterInfo &MRI,
-                                   bool useLiveOuts = false,
-                                   bool ignoreStart = false) {
+bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
+                                                   MachineInstr &Start,
+                                                   MCRegister Reg,
+                                                   bool UseLiveOuts,
+                                                   bool IgnoreStart) const {
   LivePhysRegs LR(*TRI);
-  if (useLiveOuts)
+  if (UseLiveOuts)
     LR.addLiveOuts(*Stop.getParent());
 
   MachineBasicBlock::reverse_iterator A(Start);
   MachineBasicBlock::reverse_iterator E(Stop);
 
-  if (ignoreStart)
+  if (IgnoreStart)
     ++A;
 
   for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
     LR.stepBackward(*A);
   }
 
-  return !LR.available(MRI, Reg);
+  return !LR.available(*MRI, Reg);
 }
 
 // Determine if a register Reg is not re-defined and still in use
 // in the range (Stop..BB.end].
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
-                                 const SIRegisterInfo *TRI,
-                                 MachineRegisterInfo &MRI) {
-  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
-                                MRI, true);
+bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
+                                                 MCRegister Reg) const {
+  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
 }
 
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
-    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
-    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
-  MachineInstr *VCmp = nullptr;
-
-  Register SaveExecDest = SaveExec.getOperand(0).getReg();
-  if (!TRI->isSGPRReg(MRI, SaveExecDest))
-    return nullptr;
-
-  MachineOperand *SaveExecSrc0 =
-      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
-  if (!SaveExecSrc0->isReg())
-    return nullptr;
-
-  // Try to find the last v_cmp instruction that defs the saveexec input
-  // operand without any write to Exec or the saveexec input operand inbetween.
-  VCmp = findInstrBackwards(
-      SaveExec,
-      [&](MachineInstr *Check) {
-        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
-               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
-      },
-      {Exec, SaveExecSrc0->getReg()}, TRI);
-
-  if (!VCmp)
-    return nullptr;
-
-  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
-  assert(VCmpDest && "Should have an sdst operand!");
-
-  // Check if any of the v_cmp source operands is written by the saveexec.
-  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
-  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
-      SaveExec.modifiesRegister(Src0->getReg(), TRI))
-    return nullptr;
-
-  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
-  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
-      SaveExec.modifiesRegister(Src1->getReg(), TRI))
-    return nullptr;
-
-  // Don't do the transformation if the destination operand is included in
-  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
-  // to incorrect code if the v_cmp and therefore the def of
-  // the dest operand is removed.
-  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
-    return nullptr;
-
-  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
-  // s_and_saveexec, skip the optimization.
-  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
-                             false, true) ||
-      isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
-    return nullptr;
-
-  // Try to determine if there is a write to any of the VCmp
-  // operands between the saveexec and the vcmp.
-  // If yes, additional VGPR spilling might need to be inserted. In this case,
-  // it's not worth replacing the instruction sequence.
-  SmallVector<MCRegister, 2> NonDefRegs;
-  if (Src0->isReg())
-    NonDefRegs.push_back(Src0->getReg());
-
-  if (Src1->isReg())
-    NonDefRegs.push_back(Src1->getReg());
-
-  if (!findInstrBackwards(
-          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
-          NonDefRegs, TRI))
-    return nullptr;
-
-  return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
-                                         MachineInstr &VCmp, MCRegister Exec,
-                                         const SIInstrInfo *TII,
-                                         const SIRegisterInfo *TRI,
-                                         MachineRegisterInfo &MRI) {
-  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
-  if (NewOpcode == -1)
-    return false;
-
-  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
-  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
-  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
-  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
-  if (!SaveExecInstr.uses().empty()) {
-    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
-    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
-            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
-        .addReg(Exec);
-  }
-
-  // Omit dst as V_CMPX is implicitly writing to EXEC.
-  // Add dummy src and clamp modifiers, if needed.
-  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
-                         VCmp.getDebugLoc(), TII->get(NewOpcode));
-
-  auto TryAddImmediateValueFromNamedOperand =
-      [&](unsigned OperandName) -> void {
-    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
-      Builder.addImm(Mod->getImm());
-  };
-
-  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
-  Builder.add(*Src0);
-
-  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
-  Builder.add(*Src1);
-
-  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
-
-  // The kill flags may no longer be correct.
-  if (Src0->isReg())
-    MRI.clearKillFlags(Src0->getReg());
-  if (Src1->isReg())
-    MRI.clearKillFlags(Src1->getReg());
-
-  return true;
-}
-
-bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
-    return false;
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
-  MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
-  // Optimize sequences emitted for control flow lowering. They are originally
-  // emitted as the separate operations because spill code may need to be
-  // inserted for the saved copy of exec.
-  //
-  // x = copy exec
-  // z = s_<op>_b64 x, y
-  // exec = copy z
-  // =>
-  // x = s_<op>_saveexec_b64 y
-  //
+// Optimize sequences emitted for control flow lowering. They are originally
+// emitted as the separate operations because spill code may need to be
+// inserted for the saved copy of exec.
+//
+// x = copy exec
+// z = s_<op>_b64 x, y
+// exec = copy z
+// =>
+// x = s_<op>_saveexec_b64 y
+//
+bool SIOptimizeExecMasking::optimizeExecSequence() const {
+  MCRegister Exec = TRI->getExec();
   bool Changed = false;
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+  for (MachineBasicBlock &MBB : *MF) {
+    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
     MachineBasicBlock::reverse_iterator E = MBB.rend();
     if (I == E)
       continue;
@@ -532,7 +411,7 @@
     unsigned SearchCount = 0;
     const unsigned SearchLimit = 5;
     while (I != E && SearchCount++ < SearchLimit) {
-      CopyToExec = isCopyToExec(*I, ST);
+      CopyToExec = isCopyToExec(*I);
       if (CopyToExec)
         break;
       ++I;
     }
@@ -542,8 +421,8 @@
       continue;
 
     // Scan backwards to find the def.
-    auto CopyToExecInst = &*I;
-    auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
+    auto *CopyToExecInst = &*I;
+    auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec);
     if (CopyFromExecInst == E) {
       auto PrepareExecInst = std::next(I);
       if (PrepareExecInst == E)
@@ -574,8 +453,9 @@
     MachineInstr *SaveExecInst = nullptr;
     SmallVector<MachineInstr *, 4> OtherUseInsts;
 
-    for (MachineBasicBlock::iterator J
-           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+    for (MachineBasicBlock::iterator
+             J = std::next(CopyFromExecInst->getIterator()),
+             JE = I->getIterator();
          J != JE; ++J) {
       if (SaveExecInst && J->readsRegister(Exec, TRI)) {
         LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
@@ -655,58 +535,210 @@
     BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
             CopyFromExec)
-      .addReg(OtherOp->getReg());
+        .addReg(OtherOp->getReg());
 
     SaveExecInst->eraseFromParent();
     CopyToExecInst->eraseFromParent();
 
     for (MachineInstr *OtherInst : OtherUseInsts) {
-      OtherInst->substituteRegister(CopyToExec, Exec,
-                                    AMDGPU::NoSubRegister, *TRI);
+      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
+                                    *TRI);
     }
 
     Changed = true;
   }
 
-  // After all s_op_saveexec instructions are inserted,
-  // replace (on GFX10.3 and later)
-  // v_cmp_* SGPR, IMM, VGPR
-  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-  // with
-  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
-  // v_cmpx_* IMM, VGPR
-  // to reduce pipeline stalls.
-  if (ST.hasGFX10_3Insts()) {
-    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
-    const unsigned AndSaveExecOpcode =
-        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
-    for (MachineBasicBlock &MBB : MF) {
-      for (MachineInstr &MI : MBB) {
-        // Record relevant v_cmp / s_and_saveexec instruction pairs for
-        // replacement.
-        if (MI.getOpcode() != AndSaveExecOpcode)
-          continue;
+  return Changed;
+}
 
-        if (MachineInstr *VCmp =
-                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
-          SaveExecVCmpMapping[&MI] = VCmp;
-      }
+
+// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
+// to the v_cmp instruction if it is safe to replace the sequence (see the
+// conditions in the function body). This is after register allocation, so some
+// checks on operand dependencies need to be considered.
+MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec) const {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand inbetween.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()});
+
+  if (!VCmp)
+    return nullptr;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+      SaveExec.modifiesRegister(Src0->getReg(), TRI))
+    return nullptr;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+      SaveExec.modifiesRegister(Src1->getReg(), TRI))
+    return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return nullptr;
+
+  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+  // s_and_saveexec, skip the optimization.
+  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
+                             true) ||
+      isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
+    return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the saveexec and the vcmp.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+  if (!SaveExecInstr.uses().empty()) {
+    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
+    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+  }
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add dummy src and clamp modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  auto TryAddImmediateValueFromNamedOperand =
+      [&](unsigned OperandName) -> void {
+    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+      Builder.addImm(Mod->getImm());
+  };
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+  Builder.add(*Src0);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+  Builder.add(*Src1);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+  // The kill flags may no longer be correct.
+  if (Src0->isReg())
+    MRI->clearKillFlags(Src0->getReg());
+  if (Src1->isReg())
+    MRI->clearKillFlags(Src1->getReg());
+
+  return true;
+}
+
+// After all s_op_saveexec instructions are inserted,
+// replace (on GFX10.3 and later)
+// v_cmp_* SGPR, IMM, VGPR
+// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+// with
+// s_mov_b32 EXEC_SGPR_DEST, exec_lo
+// v_cmpx_* IMM, VGPR
+// to reduce pipeline stalls.
+bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+  if (!ST->hasGFX10_3Insts())
+    return false;
+
+  bool Changed = false;
+
+  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+  MCRegister Exec = TRI->getExec();
+  const unsigned AndSaveExecOpcode =
+      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      // Record relevant v_cmp / s_and_saveexec instruction pairs for
+      // replacement.
+      if (MI.getOpcode() != AndSaveExecOpcode)
+        continue;
+
+      if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
+        SaveExecVCmpMapping[&MI] = VCmp;
     }
+  }
 
-    for (const auto &Entry : SaveExecVCmpMapping) {
-      MachineInstr *SaveExecInstr = Entry.getFirst();
-      MachineInstr *VCmpInstr = Entry.getSecond();
+  for (const auto &Entry : SaveExecVCmpMapping) {
+    MachineInstr *SaveExecInstr = Entry.getFirst();
+    MachineInstr *VCmpInstr = Entry.getSecond();
 
-      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
-                                       TRI, *MRI)) {
-        SaveExecInstr->eraseFromParent();
-        VCmpInstr->eraseFromParent();
+    if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
+      SaveExecInstr->eraseFromParent();
+      VCmpInstr->eraseFromParent();
 
-        Changed = true;
-      }
+      Changed = true;
     }
   }
 
   return Changed;
 }
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  this->MF = &MF;
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  TRI = ST->getRegisterInfo();
+  TII = ST->getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  bool Changed = optimizeExecSequence();
+  Changed |= optimizeVCmpxAndSaveexecSequence();
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -344,6 +344,8 @@
 
   MCRegister getVCC() const;
 
+  MCRegister getExec() const;
+
   const TargetRegisterClass *getRegClass(unsigned RCID) const;
 
   // Find reaching register definition
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2933,6 +2933,10 @@
   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
 }
 
+MCRegister SIRegisterInfo::getExec() const {
+  return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+}
+
 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
   // VGPR tuples have an alignment requirement on gfx90a variants.
   return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass