Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -558,14 +558,6 @@ // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - - // Functions returning something shouldn't contain S_ENDPGM, because other - // bytecode will be appended after it. - if (!ReturnsVoid) { - MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - I->eraseFromParent(); - } } return Changes; Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -486,6 +486,7 @@ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { + MachineBasicBlock *EmptyMBBAtEnd = NULL; MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { @@ -562,6 +563,29 @@ case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; + + case AMDGPU::S_ENDPGM: { + if (MF.getInfo()->returnsVoid()) + break; + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (BI != --MF.end() || I != MBB.getFirstTerminator()) { + // S_ENDPGM is not the last instruction. Add an empty block at + // the end and jump there. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + } + + I->eraseFromParent(); + break; + } } } }