diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -45,6 +45,7 @@
 
   void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
+  bool tidySCCDef(MachineInstr &MI);
   void earlyTerm(MachineInstr &MI);
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
@@ -191,22 +192,64 @@
   MDT->getBase().applyUpdates(DTUpdates);
 }
 
+bool SIInsertSkips::tidySCCDef(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  // Peek at the previous instruction to see if the branch can be unconditional.
+  assert(MI.getIterator() != MBB.begin());
+  auto Prev = std::prev(MI.getIterator());
+  if (Prev->getOpcode() == AMDGPU::S_ANDN2_B32 ||
+      Prev->getOpcode() == AMDGPU::S_ANDN2_B64) {
+    auto Src0 = Prev->getOperand(1);
+    auto Src1 = Prev->getOperand(2);
+    if (Src0.isReg() && Src0.getReg() == ExecReg && Src1.isReg() &&
+        Src1.getReg() == ExecReg) {
+      // exec & ~exec is 0, so SCC is always 0; use an unconditional branch.
+      Register Dst = Prev->getOperand(0).getReg();
+      // Fold the S_ANDN2 to a move of 0; for exec, drop it: the exit block sets exec.
+      if (Dst != ExecReg) {
+        BuildMI(MBB, Prev, Prev->getDebugLoc(), TII->get(MovOpc), Dst)
+            .addImm(0);
+      }
+      Prev->eraseFromParent();
+      return true;
+    }
+  }
+  return false;
+}
+
 void SIInsertSkips::earlyTerm(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc DL = MI.getDebugLoc();
 
   ensureEarlyExitBlock(MBB, true);
 
-  MachineInstr *BranchMI =
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
-          .addMBB(EarlyExitBlock);
+  // Can we make the branch unconditional?
+  bool ReplaceSuccessor = MBB.succ_size() <= 1;
+  if (ReplaceSuccessor)
+    ReplaceSuccessor = tidySCCDef(MI);
+
+  MachineInstr *BranchMI = nullptr;
+  if (ReplaceSuccessor) {
+    // The branch is always taken.
+    BranchMI =
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(EarlyExitBlock);
+  } else {
+    BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+                   .addMBB(EarlyExitBlock);
+  }
 
   auto Next = std::next(MI.getIterator());
   if (Next != MBB.end() && !Next->isTerminator())
     splitBlock(MBB, *BranchMI, MDT);
 
   MachineBasicBlock *OldSuccessor = nullptr;
-  MBB.addSuccessor(EarlyExitBlock);
+  if (ReplaceSuccessor && !MBB.succ_empty()) {
+    OldSuccessor = *MBB.succ_begin();
+    MBB.replaceSuccessor(OldSuccessor, EarlyExitBlock);
+  } else {
+    MBB.addSuccessor(EarlyExitBlock);
+  }
 
   // Update MDT
   MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
@@ -277,8 +320,11 @@
 
   for (MachineInstr *Instr : EarlyTermInstrs) {
     // Early termination in GS does nothing
-    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) {
       earlyTerm(*Instr);
+    } else {
+      tidySCCDef(*Instr);
+    }
     Instr->eraseFromParent();
   }
   EarlyTermInstrs.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -8,8 +8,7 @@
 ; SI-LABEL: static_exact:
 ; SI: ; %bb.0: ; %.entry
 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_andn2_b64 exec, exec, exec
-; SI-NEXT: s_cbranch_scc0 BB0_2
+; SI-NEXT: s_branch BB0_2
 ; SI-NEXT: ; %bb.1: ; %.entry
 ; SI-NEXT: s_mov_b64 exec, 0
 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
@@ -23,8 +22,7 @@
 ; GFX9-LABEL: static_exact:
 ; GFX9: ; %bb.0: ; %.entry
 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, exec
-; GFX9-NEXT: s_cbranch_scc0 BB0_2
+; GFX9-NEXT: s_branch BB0_2
 ; GFX9-NEXT: ; %bb.1: ; %.entry
 ; GFX9-NEXT: s_mov_b64 exec, 0
 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
@@ -38,8 +36,7 @@
 ; GFX10-32-LABEL: static_exact:
 ; GFX10-32: ; %bb.0: ; %.entry
 ; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo
-; GFX10-32-NEXT: s_cbranch_scc0 BB0_2
+; GFX10-32-NEXT: s_branch BB0_2
 ; GFX10-32-NEXT: ; %bb.1: ; %.entry
 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
@@ -53,8 +50,7 @@
 ; GFX10-64-LABEL: static_exact:
 ; GFX10-64: ; %bb.0: ; %.entry
 ; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec
-; GFX10-64-NEXT: s_cbranch_scc0 BB0_2
+; GFX10-64-NEXT: s_branch BB0_2
 ; GFX10-64-NEXT: ; %bb.1: ; %.entry
 ; GFX10-64-NEXT: s_mov_b64 exec, 0
 ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -10,8 +10,7 @@
 
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: s_andn2_b64 exec, exec, exec
-; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: s_branch [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: [[EXIT_BB]]:
 ; CHECK-NEXT: s_mov_b64 exec, 0