diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -69,6 +69,7 @@
                   DebugLoc DL);
 
   bool kill(MachineInstr &MI);
+  void earlyTerm(MachineInstr &MI);
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
@@ -165,19 +166,22 @@
   return true;
 }
 
-static void generatePsEndPgm(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I, DebugLoc DL,
-                             const SIInstrInfo *TII) {
-  // Generate "null export; s_endpgm".
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
-      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addImm(1)  // vm
-      .addImm(0)  // compr
-      .addImm(0); // en
+static void generateEndPgm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           const SIInstrInfo *TII, bool IsPS) {
+  // Pixel shaders end with a "null export"; other shader types skip it.
+  if (IsPS) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+        .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addImm(1)  // vm
+        .addImm(0)  // compr
+        .addImm(0); // en
+  }
+  // s_endpgm
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }
 
@@ -189,7 +193,9 @@
   if (!EarlyExitBlock) {
     EarlyExitBlock = MF->CreateMachineBasicBlock();
     MF->insert(MF->end(), EarlyExitBlock);
-    generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
+                   MF->getFunction().getCallingConv() ==
+                       CallingConv::AMDGPU_PS);
     EarlyExitClearsExec = false;
   }
 
@@ -198,7 +204,6 @@
     unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    assert(ExitI->getOpcode() == AMDGPU::EXP_DONE);
     BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
     EarlyExitClearsExec = true;
   }
@@ -244,7 +249,7 @@
       llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
 
   if (NoSuccessor) {
-    generatePsEndPgm(MBB, I, DL, TII);
+    generateEndPgm(MBB, I, DL, TII, /*IsPS=*/true);
   } else {
     ensureEarlyExitBlock(MBB, false);
 
@@ -388,6 +393,25 @@
   }
 }
 
+void SIInsertSkips::earlyTerm(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  ensureEarlyExitBlock(MBB, true);
+
+  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+                      .addMBB(EarlyExitBlock);
+  auto Next = std::next(MI.getIterator());
+
+  if (Next != MBB.end() && !Next->isTerminator())
+    splitBlock(MBB, *BranchMI, MDT);
+
+  MBB.addSuccessor(EarlyExitBlock);
+  MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+
+  MI.eraseFromParent();
+}
+
 // Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                    MachineBasicBlock &SrcMBB) {
@@ -413,6 +437,7 @@
   SkipThreshold = SkipThresholdFlag;
 
   SmallVector<MachineInstr *, 4> KillInstrs;
+  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
   bool MadeChange = false;
 
   for (MachineBasicBlock &MBB : MF) {
@@ -471,18 +496,34 @@
         }
         break;
 
+      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
+        // The pseudo is always lowered or erased below, so report a change.
+        MadeChange = true;
+        EarlyTermInstrs.push_back(&MI);
+        break;
+
       default:
         break;
       }
     }
   }
 
+  // Early termination in GS does nothing, so there the pseudo is just erased.
+  const bool IsGS =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_GS;
+  for (MachineInstr *Instr : EarlyTermInstrs) {
+    if (IsGS)
+      Instr->eraseFromParent();
+    else
+      earlyTerm(*Instr);
+  }
   for (MachineInstr *Kill : KillInstrs) {
     skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
                Kill->getDebugLoc());
     Kill->eraseFromParent();
   }
   KillInstrs.clear();
+  EarlyTermInstrs.clear();
   EarlyExitBlock = nullptr;
 
   return MadeChange;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -357,6 +357,14 @@
   let isReMaterializable = 1;
 }
 
+// Branch to the early termination block of the shader if SCC is 0.
+// This uses SCC from a previous SALU operation, i.e. the update of
+// a mask of live lanes after a kill/demote operation. The exit block only
+// has a null export in pixel shaders; geometry shaders drop this pseudo.
+def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> { + let Uses = [EXEC,SCC]; +} + let Uses = [EXEC] in { multiclass PseudoInstKill { diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -0,0 +1,262 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-insert-skips -verify-machineinstrs %s -o - | FileCheck %s + +--- | + define amdgpu_ps void @early_term_scc0_end_block() { + ret void + } + + define amdgpu_ps void @early_term_scc0_next_terminator() { + ret void + } + + define amdgpu_ps void @early_term_scc0_in_block() { + ret void + } + + define amdgpu_ps void @early_term_scc0_with_kill() { + ret void + } + + define amdgpu_cs void @early_term_scc0_cs() { + ret void + } + + define amdgpu_gs void @early_term_scc0_gs() { + ret void + } +... + +--- +name: early_term_scc0_end_block +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } +body: | + ; CHECK-LABEL: name: early_term_scc0_end_block + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; CHECK: liveins: $sgpr0 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; CHECK: bb.1: + ; CHECK: liveins: $vgpr0 + ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: $exec_lo = S_MOV_B32 0 + ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + successors: %bb.1 + + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + + bb.1: + liveins: $vgpr0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit 
$exec + S_ENDPGM 0 +... + +--- +name: early_term_scc0_next_terminator +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } +body: | + ; CHECK-LABEL: name: early_term_scc0_next_terminator + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000), %bb.3(0x00000000) + ; CHECK: liveins: $sgpr0 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; CHECK: bb.2: + ; CHECK: liveins: $vgpr0 + ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.3: + ; CHECK: $exec_lo = S_MOV_B32 0 + ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + successors: %bb.2 + + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + liveins: $vgpr0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: early_term_scc0_in_block +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } +body: | + ; CHECK-LABEL: name: early_term_scc0_in_block + ; CHECK: bb.0: + ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $sgpr0 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; CHECK: bb.3: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0, $scc + ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; CHECK: bb.1: + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: $exec_lo = S_MOV_B32 0 + ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + successors: %bb.1 + + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + + bb.1: + liveins: $vgpr0, $vgpr1 + EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: early_term_scc0_with_kill +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } + - { reg: '$vgpr2' } +body: | + ; CHECK-LABEL: name: early_term_scc0_with_kill + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000), %bb.3(0x00000000) + ; CHECK: liveins: $sgpr0, $vgpr2 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: V_CMPX_LE_F32_nosdst_e32 0, killed $vgpr2, implicit-def $exec, implicit $mode, implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK: bb.1: + ; CHECK: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; CHECK: bb.4: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $vgpr0, $scc + ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; CHECK: bb.2: + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.3: + ; CHECK: $exec_lo = S_MOV_B32 0 + ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0, $vgpr2 + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + SI_KILL_F32_COND_IMM_TERMINATOR killed $vgpr2, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec + + bb.1: + liveins: $sgpr0, $vgpr0 + successors: %bb.2 + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + + bb.2: + liveins: $vgpr0, $vgpr1 + EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: early_term_scc0_gs +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } +body: | + ; CHECK-LABEL: name: early_term_scc0_gs + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: bb.1: + ; CHECK: liveins: $vgpr0 + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + successors: %bb.1 + + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + + bb.1: + liveins: $vgpr0 + S_ENDPGM 0 +... + +--- +name: early_term_scc0_cs +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0' } +body: | + ; CHECK-LABEL: name: early_term_scc0_cs + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; CHECK: liveins: $sgpr0 + ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; CHECK: bb.1: + ; CHECK: liveins: $vgpr0 + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: $exec_lo = S_MOV_B32 0 + ; CHECK: S_ENDPGM 0 + bb.0: + liveins: $sgpr0 + successors: %bb.1 + + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc + SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec + + bb.1: + liveins: $vgpr0 + S_ENDPGM 0 +...