diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -52,6 +52,11 @@
 
 char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
 
+/// Return EXEC_LO for wave32 subtargets and EXEC otherwise.
+static MCRegister getExecByWaveMode(const GCNSubtarget &ST) {
+  return ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+}
+
 /// If \p MI is a copy from exec, return the register copied to.
 static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
   switch (MI.getOpcode()) {
@@ -62,7 +67,7 @@
   case AMDGPU::S_MOV_B32_term: {
     const MachineOperand &Src = MI.getOperand(1);
     if (Src.isReg() &&
-        Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
+        Src.getReg() == getExecByWaveMode(ST))
       return MI.getOperand(0).getReg();
   }
   }
@@ -78,7 +83,7 @@
   case AMDGPU::S_MOV_B32: {
     const MachineOperand &Dst = MI.getOperand(0);
     if (Dst.isReg() &&
-        Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+        Dst.getReg() == getExecByWaveMode(ST) &&
         MI.getOperand(1).isReg())
       return MI.getOperand(1).getReg();
     break;
@@ -497,6 +502,62 @@
   return true;
 }
 
+// After all s_op_saveexec instructions are inserted,
+// replace (on GFX10.3 and later)
+// v_cmp_* SGPR, IMM, VGPR
+// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+// with
+// s_mov_b32 EXEC_SGPR_DEST, exec_lo
+// v_cmpx_* IMM, VGPR
+// to reduce pipeline stalls.
+// Returns true if at least one such pair was replaced.
+static bool combineVCmpxAndSaveexec(MachineFunction &MF, const GCNSubtarget &ST,
+                                    const SIRegisterInfo *TRI,
+                                    MachineRegisterInfo *MRI,
+                                    const SIInstrInfo *TII) {
+  if (!ST.hasGFX10_3Insts())
+    return false;
+
+  bool Changed = false;
+
+  // NOTE(review): DenseMap iteration order over pointer keys is not stable
+  // between runs; consider an order-preserving container -- TODO confirm.
+  DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+  MCRegister Exec = getExecByWaveMode(ST);
+  const unsigned AndSaveExecOpcode =
+      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      // Record relevant v_cmp / s_and_saveexec instruction pairs for
+      // replacement.
+      if (MI.getOpcode() != AndSaveExecOpcode)
+        continue;
+
+      if (MachineInstr *VCmp =
+              findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
+        SaveExecVCmpMapping[&MI] = VCmp;
+    }
+  }
+
+  // Apply the rewrites only after the scan above, so no instruction is
+  // erased while the blocks are still being iterated.
+  for (const auto &Entry : SaveExecVCmpMapping) {
+    MachineInstr *SaveExecInstr = Entry.getFirst();
+    MachineInstr *VCmpInstr = Entry.getSecond();
+
+    if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, TRI,
+                                     *MRI)) {
+      SaveExecInstr->eraseFromParent();
+      VCmpInstr->eraseFromParent();
+
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -505,7 +566,7 @@
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo *MRI = &MF.getRegInfo();
-  MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  MCRegister Exec = getExecByWaveMode(ST);
 
   // Optimize sequences emitted for control flow lowering. They are originally
   // emitted as the separate operations because spill code may need to be
@@ -668,45 +729,7 @@
     Changed = true;
   }
 
-  // After all s_op_saveexec instructions are inserted,
-  // replace (on GFX10.3 and later)
-  // v_cmp_* SGPR, IMM, VGPR
-  // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-  // with
-  // s_mov_b32 EXEC_SGPR_DEST, exec_lo
-  // v_cmpx_* IMM, VGPR
-  // to reduce pipeline stalls.
-  if (ST.hasGFX10_3Insts()) {
-    DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
-    const unsigned AndSaveExecOpcode =
-        ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
-    for (MachineBasicBlock &MBB : MF) {
-      for (MachineInstr &MI : MBB) {
-        // Record relevant v_cmp / s_and_saveexec instruction pairs for
-        // replacement.
-        if (MI.getOpcode() != AndSaveExecOpcode)
-          continue;
-
-        if (MachineInstr *VCmp =
-                findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
-          SaveExecVCmpMapping[&MI] = VCmp;
-      }
-    }
-
-    for (const auto &Entry : SaveExecVCmpMapping) {
-      MachineInstr *SaveExecInstr = Entry.getFirst();
-      MachineInstr *VCmpInstr = Entry.getSecond();
-
-      if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
-                                       TRI, *MRI)) {
-        SaveExecInstr->eraseFromParent();
-        VCmpInstr->eraseFromParent();
-
-        Changed = true;
-      }
-    }
-  }
+  Changed |= combineVCmpxAndSaveexec(MF, ST, TRI, MRI, TII);
 
   return Changed;
 }