// Patch summary. This is a unified diff over three AMDGPU backend files; the
// hunks are fragments, so most enclosing function/class headers are not visible.
//
// SIInstrInfo.cpp
//   - Adds two switch cases that rewrite S_AND_SAVEEXEC_B64_term and
//     S_AND_SAVEEXEC_B32_term in place to S_AND_SAVEEXEC_B64 / S_AND_SAVEEXEC_B32
//     via MI.setDesc(get(...)). Per the comment carried in the hunk, the _term
//     form exists only to get correct spill code placement during register
//     allocation.
//   - Adds both new _term opcodes to an existing case list of *_term opcodes
//     that ends in a plain `break`.
//
// SIInstructions.td
//   - Adds defs for S_AND_SAVEEXEC_B64_term and S_AND_SAVEEXEC_B32_term next to
//     the existing *_B64_term / *_B32_term defs (the B32 group appears to sit
//     in the `let WaveSizePredicate = isWave32 in {` block -- confirm).
//   - NOTE(review): every `WrapTerminatorInst` def in these hunks is shown
//     without a template argument; presumably a `<...>` argument was lost when
//     this diff was extracted -- confirm against the real .td before applying.
//
// SIWholeQuadMode.cpp
//   - Adds the members AndTermOpc and AndSaveExecTermOpc, removes OrSaveExecOpc,
//     and assigns the new members in both the ST->isWave32() branch and the
//     else branch.
//   - toExact(): computes IsTerminator, which is true when Before == MBB.end()
//     or when the slot index of *Before is greater than the slot index of the
//     block's first terminator. When IsTerminator is set, the inserted
//     instruction uses AndSaveExecTermOpc (SaveWQM set) or AndTermOpc
//     (otherwise) instead of AndSaveExecOpc / AndOpc.
//   - Removes the `MI.isTerminator() && OutNeeds == StateExact` check that
//     assigned Needs = StateExact.
//
// NOTE(review): the diff text below is collapsed onto three physical lines, so
// the original hunk line structure is lost -- presumably it must be restored
// before a patch tool will accept it.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1953,6 +1953,18 @@ MI.setDesc(get(AMDGPU::S_AND_B32)); break; + case AMDGPU::S_AND_SAVEEXEC_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64)); + break; + + case AMDGPU::S_AND_SAVEEXEC_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2730,11 +2742,13 @@ case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_AND_B64_term: + case AMDGPU::S_AND_SAVEEXEC_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: case AMDGPU::S_AND_B32_term: + case AMDGPU::S_AND_SAVEEXEC_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -306,6 +306,7 @@ def S_OR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; def S_AND_B64_term : WrapTerminatorInst; +def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -314,6 +315,7 @@ def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; def S_AND_B32_term : WrapTerminatorInst; +def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -158,10 
+158,11 @@ MachinePostDominatorTree *PDT; unsigned AndOpc; + unsigned AndTermOpc; unsigned AndN2Opc; unsigned XorOpc; unsigned AndSaveExecOpc; - unsigned OrSaveExecOpc; + unsigned AndSaveExecTermOpc; unsigned WQMOpc; Register Exec; Register LiveMaskReg; @@ -1206,13 +1207,25 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SaveWQM) { + bool IsTerminator = Before == MBB.end(); + if (!IsTerminator) { + auto FirstTerm = MBB.getFirstTerminator(); + if (FirstTerm != MBB.end()) { + SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm); + SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before); + IsTerminator = BeforeIdx > FirstTermIdx; + } + } + MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM) + unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) .addReg(LiveMaskReg); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec) + unsigned Opcode = IsTerminator ? 
AndTermOpc : AndOpc; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) .addReg(Exec) .addReg(LiveMaskReg); } @@ -1365,9 +1378,6 @@ Needs = StateExact | StateWQM | StateStrict; } - if (MI.isTerminator() && OutNeeds == StateExact) - Needs = StateExact; - ++Next; } else { // End of basic block @@ -1591,18 +1601,20 @@ if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; + AndTermOpc = AMDGPU::S_AND_B32_term; AndN2Opc = AMDGPU::S_ANDN2_B32; XorOpc = AMDGPU::S_XOR_B32; AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; WQMOpc = AMDGPU::S_WQM_B32; Exec = AMDGPU::EXEC_LO; } else { AndOpc = AMDGPU::S_AND_B64; + AndTermOpc = AMDGPU::S_AND_B64_term; AndN2Opc = AMDGPU::S_ANDN2_B64; XorOpc = AMDGPU::S_XOR_B64; AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; WQMOpc = AMDGPU::S_WQM_B64; Exec = AMDGPU::EXEC; }