Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -251,6 +251,7 @@
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
+    bool HaveSkipBlock = false;
 
     if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
       // Reached convergence point for last divergent branch.
@@ -278,8 +279,14 @@
       case AMDGPU::S_BRANCH: {
         // Optimize out branches to the next block.
         // FIXME: Shouldn't this be handled by BranchFolding?
-        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
           MI.eraseFromParent();
+        } else if (HaveSkipBlock) {
+          // Remove the given unconditional branch when a skip block has been
+          // inserted after the current one, so that the two instructions
+          // performing the kill are skipped only if the exec mask is non-zero.
+          MI.eraseFromParent();
+        }
         break;
       }
       case AMDGPU::SI_KILL_TERMINATOR: {
@@ -288,9 +295,9 @@
 
         if (ExecBranchStack.empty()) {
           if (skipIfDead(MI, *NextBB)) {
+            HaveSkipBlock = true;
             NextBB = std::next(BI);
             BE = MF.end();
-            Next = MBB.end();
           }
         } else {
           HaveKill = true;
Index: test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -0,0 +1,42 @@
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 %s -o - | FileCheck %s
+
+--- |
+  define amdgpu_ps void @kill_uncond_branch() {
+    ret void
+  }
+...
+---
+
+# CHECK-LABEL: name: kill_uncond_branch
+
+# CHECK: bb.0:
+# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+
+# CHECK: bb.1:
+# CHECK: V_CMPX_LE_F32_e32
+# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit %exec
+# The original S_BRANCH %bb.2 must be gone so bb.1 can fall through to bb.3.
+# CHECK-NOT: S_BRANCH
+
+# CHECK: bb.3:
+# CHECK-NEXT: EXP_DONE
+# CHECK: S_ENDPGM
+
+# CHECK: bb.2:
+# CHECK: S_ENDPGM
+
+name: kill_uncond_branch
+
+body: |
+  bb.0:
+    successors: %bb.1
+    S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+
+  bb.1:
+    successors: %bb.2
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM