diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -447,6 +447,15 @@
         break;
       }
 
+      case AMDGPU::SI_KILL_CLEANUP:
+        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          KillInstrs.push_back(&MI);
+        } else {
+          MI.eraseFromParent();
+        }
+        break;
+
       default:
         break;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -379,6 +379,9 @@
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -89,8 +89,10 @@
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<const MachineInstr*> LoweredIf;
+  SmallSet<MachineInstr *, 16> NeedsKillCleanup;
 
   const TargetRegisterClass *BoolRC = nullptr;
+  bool InsertKillCleanups;
   unsigned AndOpc;
   unsigned OrOpc;
   unsigned XorOpc;
@@ -111,6 +113,8 @@
 
   void combineMasks(MachineInstr &MI);
 
+  void process(MachineInstr &MI);
+
   // Skip to the next instruction, ignoring debug instructions, and trivial
   // block boundaries (blocks that have one (typically fallthrough) successor,
   // and the successor has one predecessor.
@@ -160,36 +164,36 @@
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
-                       const SIInstrInfo *TII) {
-  Register SaveExecReg = MI.getOperand(0).getReg();
-  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
-  if (U == MRI->use_instr_nodbg_end() ||
-      std::next(U) != MRI->use_instr_nodbg_end() ||
-      U->getOpcode() != AMDGPU::SI_END_CF)
-    return false;
-
-  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-  // if there is any such terminator simplififcations are not safe.
-  auto SMBB = MI.getParent();
-  auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
   DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(),
-                                               SMBB->succ_end());
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+                                               Begin->succ_end());
 
   while (!Worklist.empty()) {
     MachineBasicBlock *MBB = Worklist.pop_back_val();
 
-    if (MBB == EMBB || !Visited.insert(MBB).second)
+    if (MBB == End || !Visited.insert(MBB).second)
       continue;
-    for(auto &Term : MBB->terminators())
+    for (auto &Term : MBB->terminators())
       if (TII->isKillTerminator(Term.getOpcode()))
-        return false;
+        return true;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
   }
 
+  return false;
+}
+
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+  Register SaveExecReg = MI.getOperand(0).getReg();
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+  if (U == MRI->use_instr_nodbg_end() ||
+      std::next(U) != MRI->use_instr_nodbg_end() ||
+      U->getOpcode() != AMDGPU::SI_END_CF)
+    return false;
+
   return true;
 }
 
@@ -207,7 +211,31 @@
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI, TII);
+  bool SimpleIf = isSimpleIf(MI, MRI);
+
+  if (InsertKillCleanups) {
+    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+    // flag the associated SI_END_CF for insertion of a kill cleanup.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+      MachineOperand &NextExec = UseMI->getOperand(0);
+      Register NextExecReg = NextExec.getReg();
+      if (NextExec.isDead())
+        break;
+      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+    }
+    if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+      if (UseMI->getOpcode() == AMDGPU::SI_END_CF)
+        NeedsKillCleanup.insert(&*UseMI);
+      SimpleIf = false;
+    }
+  } else if (SimpleIf) {
+    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // if there is any such terminator simplifications are not safe.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+  }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
@@ -427,6 +455,8 @@
 
     auto E = B->end();
     for ( ; It != E; ++It) {
+      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+        continue;
       if (TII->mayReadEXEC(*MRI, *It))
         break;
     }
@@ -461,8 +491,18 @@
 
   LoweredEndCf.insert(NewMI);
 
-  if (LIS)
+  // If this ends control flow which contains kills (as flagged in emitIf)
+  // then insert an SI_KILL_CLEANUP immediately following the exec mask
+  // manipulation. This can be lowered to early termination if appropriate.
+  MachineInstr *CleanUpMI = nullptr;
+  if (NeedsKillCleanup.count(&MI))
+    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+  if (LIS) {
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    if (CleanUpMI)
+      LIS->InsertMachineInstrInMaps(*CleanUpMI);
+  }
 
   MI.eraseFromParent();
 
@@ -553,6 +593,56 @@
   }
 }
 
+void SILowerControlFlow::process(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::iterator I(MI);
+  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_IF:
+    emitIf(MI);
+    break;
+
+  case AMDGPU::SI_ELSE:
+    emitElse(MI);
+    break;
+
+  case AMDGPU::SI_IF_BREAK:
+    emitIfBreak(MI);
+    break;
+
+  case AMDGPU::SI_LOOP:
+    emitLoop(MI);
+    break;
+
+  case AMDGPU::SI_END_CF:
+    emitEndCf(MI);
+    break;
+
+  default:
+    assert(false && "Attempt to process unsupported instruction");
+    break;
+  }
+
+  MachineBasicBlock::iterator Next;
+  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MaskMI = *I;
+    switch (MaskMI.getOpcode()) {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+      // Cleanup bit manipulations on exec mask
+      combineMasks(MaskMI);
+      break;
+    default:
+      I = MBB.end();
+      break;
+    }
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -562,6 +652,8 @@
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
+  InsertKillCleanups =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   if (ST.isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
@@ -583,62 +675,49 @@
     Exec = AMDGPU::EXEC;
   }
 
+  SmallVector<MachineInstr *, 32> Worklist;
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next, Last;
-
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
       case AMDGPU::SI_IF:
-        emitIf(MI);
+        process(MI);
         break;
 
       case AMDGPU::SI_ELSE:
-        emitElse(MI);
-        break;
-
       case AMDGPU::SI_IF_BREAK:
-        emitIfBreak(MI);
-        break;
-
       case AMDGPU::SI_LOOP:
-        emitLoop(MI);
-        break;
-
       case AMDGPU::SI_END_CF:
-        emitEndCf(MI);
+        // Only build worklist if SI_IF instructions must be processed first.
+        if (InsertKillCleanups)
+          Worklist.push_back(&MI);
+        else
+          process(MI);
         break;
 
-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
-        // Cleanup bit manipulations on exec mask
-        combineMasks(MI);
-        Last = I;
-        continue;
-
       default:
-        Last = I;
-        continue;
+        break;
       }
-
-      // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }
 
+  for (MachineInstr *MI : Worklist)
+    process(*MI);
+
   optimizeEndCf();
 
   LoweredEndCf.clear();
   LoweredIf.clear();
+  NeedsKillCleanup.clear();
 
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -61,9 +61,11 @@
   br label %loop
 }
 
-; In case there's an epilog, we shouldn't have to do this.
+; Check that the epilog is the final block
 ; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
 define amdgpu_ps float @return_nonvoid(float %0) #0 {
 main_body:
   %cmp = fcmp olt float %0, 1.000000e+01
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -470,7 +470,11 @@
 }
 
 ; CHECK-LABEL: {{^}}cbranch_kill:
-; CHECK-NOT: exp null off, off, off, off done vm
+; CHECK: ; %bb.{{[0-9]+}}: ; %export
+; CHECK-NEXT: s_or_b64
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
   %val0 = extractelement <2 x float> %1, i32 0