[AMDGPU] SIPreEmitPeephole: handle SI_RETURN_TO_EPILOG behind other terminators

The first terminator of each block is still checked for S_CBRANCH_VCCZ /
S_CBRANCH_VCCNZ and handed to optimizeVccBranch. The SI_RETURN_TO_EPILOG
rewrite (replace with S_BRANCH to the empty block at the end of the function)
now walks every terminator of the block instead of looking at the first one
only. After a rewrite the walk restarts from the block's first terminator,
because the erased instruction invalidates the current iterator.

The new test has a block (bb.6.end) whose terminators are a conditional
S_CBRANCH_EXECZ followed by the branch that replaces SI_RETURN_TO_EPILOG.

diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -266,16 +266,24 @@
 
   for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
-    if (MBBE != MBB.end()) {
-      MachineInstr &MI = *MBBE;
+    MachineBasicBlock::iterator TermI = MBBE;
+    // Check first terminator for VCC branches to optimize
+    if (TermI != MBB.end()) {
+      MachineInstr &MI = *TermI;
       switch (MI.getOpcode()) {
       case AMDGPU::S_CBRANCH_VCCZ:
       case AMDGPU::S_CBRANCH_VCCNZ:
         Changed |= optimizeVccBranch(MI);
         continue;
-      case AMDGPU::SI_RETURN_TO_EPILOG:
-        // FIXME: This is not an optimization and should be
-        // moved somewhere else.
+      default:
+        break;
+      }
+    }
+    // Check all terminators for SI_RETURN_TO_EPILOG
+    // FIXME: This is not an optimization and should be moved somewhere else.
+    while (TermI != MBB.end()) {
+      MachineInstr &MI = *TermI;
+      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
         assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
 
         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
@@ -293,11 +301,11 @@
               .addMBB(EmptyMBBAtEnd);
           MI.eraseFromParent();
           MBBE = MBB.getFirstTerminator();
+          TermI = MBBE;
+          continue;
         }
-        break;
-      default:
-        break;
       }
+      ++TermI;
     }
 
     if (!ST.hasVGPRIndexMode())
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -78,4 +78,73 @@
   unreachable
 }
 
+define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 {
+  ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
+  ; GCN: bb.0.entry:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.4(0x40000000)
+  ; GCN:   liveins: $vgpr0
+  ; GCN:   renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+  ; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
+  ; GCN:   $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GCN:   renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GCN: bb.1.flow.preheader:
+  ; GCN:   successors: %bb.2(0x80000000)
+  ; GCN:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN:   nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
+  ; GCN:   renamable $sgpr2_sgpr3 = S_MOV_B64 0
+  ; GCN: bb.2.flow:
+  ; GCN:   successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  ; GCN:   liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3
+  ; GCN:   renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
+  ; GCN:   renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN:   $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  ; GCN: bb.3.Flow:
+  ; GCN:   successors: %bb.4(0x80000000)
+  ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+  ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN: bb.4.Flow1:
+  ; GCN:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; GCN:   liveins: $sgpr0_sgpr1
+  ; GCN:   renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GCN:   $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GCN: bb.5.kill0:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   liveins: $sgpr0_sgpr1
+  ; GCN:   $exec = S_MOV_B64 0
+  ; GCN: bb.6.end:
+  ; GCN:   successors: %bb.7(0x40000000), %bb.8(0x40000000)
+  ; GCN:   liveins: $sgpr0_sgpr1
+  ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.7, implicit $exec
+  ; GCN:   S_BRANCH %bb.8
+  ; GCN: bb.7:
+  ; GCN:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+  ; GCN:   S_ENDPGM 0
+  ; GCN: bb.8:
+entry:
+  %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
+  %cmp0 = fcmp olt float %.i0, 0.000000e+00
+  br i1 %cmp0, label %kill0, label %flow
+
+kill0:                                            ; preds = %entry
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %end
+
+flow:                                             ; preds = %entry
+  %cmp1 = fcmp olt float %val, 0.000000e+00
+  br i1 %cmp1, label %flow, label %end
+
+kill1:                                            ; preds = %flow
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %end
+
+end:                                              ; preds = %kill0, %kill1, %flow
+  ret { <4 x float> } undef
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+
 attributes #0 = { nounwind }