diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -366,7 +366,6 @@ MDT = &getAnalysis(); SkipThreshold = SkipThresholdFlag; - MachineBasicBlock *EmptyMBBAtEnd = nullptr; SmallVector KillInstrs; bool MadeChange = false; @@ -417,29 +416,6 @@ break; } - case AMDGPU::SI_RETURN_TO_EPILOG: - // FIXME: Should move somewhere else - assert(!MF.getInfo()->returnsVoid()); - - // Graphics shaders returning non-void shouldn't contain S_ENDPGM, - // because external bytecode will be appended at the end. - if (&MBB != &MF.back() || &MI != &MBB.back()) { - // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at - // the end and jump there. - if (!EmptyMBBAtEnd) { - EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); - MF.insert(MF.end(), EmptyMBBAtEnd); - } - - MBB.addSuccessor(EmptyMBBAtEnd); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(EmptyMBBAtEnd); - MI.eraseFromParent(); - - MDT->getBase().insertEdge(&MBB, EmptyMBBAtEnd); - } - break; - default: break; } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -15,6 +15,7 @@ #include "AMDGPUSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/CommandLine.h" @@ -198,6 +199,7 @@ const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); + MachineBasicBlock *EmptyMBBAtEnd = nullptr; bool Changed = false; for (MachineBasicBlock &MBB : MF) { @@ -209,6 +211,28 @@ case AMDGPU::S_CBRANCH_VCCNZ: Changed |= optimizeVccBranch(MI); continue; + case AMDGPU::SI_RETURN_TO_EPILOG: + // FIXME: This is not an optimization and should be 
+ // moved somewhere else. + assert(!MF.getInfo()->returnsVoid()); + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (&MBB != &MF.back() || &MI != &MBB.back()) { + // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block + // at the end and jump there. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + MI.eraseFromParent(); + MBBE = MBB.getFirstTerminator(); + } + break; default: break; } diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -stop-after=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s +; If the block containing the SI_RETURN_TO_EPILOG is not the last block, insert an empty block at the end and +; insert an unconditional jump there. 
+define amdgpu_ps float @simple_test_return_to_epilog(float %a) #0 { ; Single-block case: the return sits in the only block, so the checks below expect SI_RETURN_TO_EPILOG to be left in place.
+  ; GCN-LABEL: name: simple_test_return_to_epilog
+  ; GCN: bb.0.entry:
+  ; GCN:   liveins: $vgpr0
+  ; GCN:   SI_RETURN_TO_EPILOG killed $vgpr0
+entry:
+  ret float %a
+}
+
+define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float %b) #0 { ; One returning block (%if) that is followed by %else: the checks below expect its return to be replaced by S_BRANCH to an empty trailing block (bb.3).
+  ; GCN-LABEL: name: test_return_to_epilog_into_end_block
+  ; GCN: bb.0.entry:
+  ; GCN:   successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
+  ; GCN:   liveins: $sgpr2, $vgpr0
+  ; GCN:   S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
+  ; GCN:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; GCN: bb.1.if:
+  ; GCN:   successors: %bb.3(0x80000000)
+  ; GCN:   liveins: $vgpr0
+  ; GCN:   S_BRANCH %bb.3
+  ; GCN: bb.2.else:
+  ; GCN:   successors:
+  ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN: bb.3:
+entry:
+  %cc = icmp sgt i32 %a, 0
+  br i1 %cc, label %if, label %else
+if: ; preds = %entry
+  ret float %b
+else: ; preds = %entry
+  store volatile i32 0, i32 addrspace(1)* undef
+  unreachable ; this path never returns; the checks above expect an empty successor list for this block
+}
+
+define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, i32 inreg %b, float %c, float %d) #0 { ; Two returning blocks (%if and %else.if): the checks below expect both to branch to one shared empty trailing block (bb.5).
+  ; GCN-LABEL: name: test_unify_return_to_epilog_into_end_block
+  ; GCN: bb.0.entry:
+  ; GCN:   successors: %bb.1(0x50000000), %bb.2(0x30000000)
+  ; GCN:   liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
+  ; GCN:   S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
+  ; GCN:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; GCN: bb.1.if:
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   liveins: $vgpr0
+  ; GCN:   S_BRANCH %bb.5
+  ; GCN: bb.2.else.if.cond:
+  ; GCN:   successors: %bb.3(0x7fffffff), %bb.4(0x00000001)
+  ; GCN:   liveins: $sgpr3, $vgpr1
+  ; GCN:   S_CMP_LT_I32 killed renamable $sgpr3, 1, implicit-def $scc
+  ; GCN:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; GCN: bb.3.else.if:
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   liveins: $vgpr1
+  ; GCN:   $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
+  ; GCN:   S_BRANCH %bb.5
+  ; GCN: bb.4.else:
+  ; GCN:   successors:
+  ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN: bb.5:
+entry:
+  %cc = icmp sgt i32 %a, 0
+  br i1 %cc, label %if, label %else.if.cond
+if: ; preds = %entry
+  ret float %c ; first return; the checks above expect S_BRANCH %bb.5 here
+else.if.cond: ; preds = %entry
+  %cc1 = icmp sgt i32 %b, 0
+  br i1 %cc1, label %else.if, label %else
+else.if: ; preds = %else.if.cond
+  ret float %d ; second return; the checks above expect %d copied into $vgpr0, then S_BRANCH %bb.5
+else: ; preds = %else.if.cond
+  store volatile i32 0, i32 addrspace(1)* undef
+  unreachable ; this path never returns; the checks above expect an empty successor list for this block
+}
+
+attributes #0 = { nounwind }