diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -44,12 +44,8 @@ bool shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const; - bool dominatesAllReachable(MachineBasicBlock &MBB); void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec); - void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL); - bool kill(MachineInstr &MI); void earlyTerm(MachineInstr &MI); bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); @@ -57,6 +53,9 @@ public: static char ID; + unsigned MovOpc; + Register ExecReg; + SIInsertSkips() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -138,15 +137,6 @@ return false; } -/// Check whether \p MBB dominates all blocks that are reachable from it. -bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) { - for (MachineBasicBlock *Other : depth_first(&MBB)) { - if (!MDT->dominates(&MBB, Other)) - return false; - } - return true; -} - static void generateEndPgm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, const SIInstrInfo *TII, bool IsPS) { @@ -181,11 +171,8 @@ } if (ClearExec && !EarlyExitClearsExec) { - const GCNSubtarget &ST = MF->getSubtarget(); - unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; auto ExitI = EarlyExitBlock->getFirstNonPHI(); - BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0); + BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0); EarlyExitClearsExec = true; } } @@ -205,175 +192,6 @@ MDT->getBase().applyUpdates(DTUpdates); } -/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given -/// iterator. Only applies to pixel shaders. -void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL) { - MachineFunction *MF = MBB.getParent(); - (void)MF; - assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS); - - // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a - // basic block that has no further successors (e.g., there was an - // `unreachable` there in IR). This can happen with original source of the - // form: - // - // if (uniform_condition) { - // write_to_memory(); - // discard; - // } - // - // In this case, we write the "null_export; s_endpgm" skip code in the - // already-existing basic block. - auto NextBBI = std::next(MBB.getIterator()); - bool NoSuccessor = - I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI); - - if (NoSuccessor) { - generateEndPgm(MBB, I, DL, TII, true); - } else { - ensureEarlyExitBlock(MBB, false); - - MachineInstr *BranchMI = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(EarlyExitBlock); - - // Split the block if the branch will not come at the end. - auto Next = std::next(BranchMI->getIterator()); - if (Next != MBB.end() && !Next->isTerminator()) - splitBlock(MBB, *BranchMI, MDT); - - MBB.addSuccessor(EarlyExitBlock); - MDT->getBase().insertEdge(&MBB, EarlyExitBlock); - } -} - -/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions. -/// Return true unless the terminator is a no-op. 
-bool SIInsertSkips::kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - switch (MI.getOpcode()) { - case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: { - unsigned Opcode = 0; - - // The opcodes are inverted because the inline immediate has to be - // the first operand, e.g. from "x < imm" to "imm > x" - switch (MI.getOperand(2).getImm()) { - case ISD::SETOEQ: - case ISD::SETEQ: - Opcode = AMDGPU::V_CMPX_EQ_F32_e64; - break; - case ISD::SETOGT: - case ISD::SETGT: - Opcode = AMDGPU::V_CMPX_LT_F32_e64; - break; - case ISD::SETOGE: - case ISD::SETGE: - Opcode = AMDGPU::V_CMPX_LE_F32_e64; - break; - case ISD::SETOLT: - case ISD::SETLT: - Opcode = AMDGPU::V_CMPX_GT_F32_e64; - break; - case ISD::SETOLE: - case ISD::SETLE: - Opcode = AMDGPU::V_CMPX_GE_F32_e64; - break; - case ISD::SETONE: - case ISD::SETNE: - Opcode = AMDGPU::V_CMPX_LG_F32_e64; - break; - case ISD::SETO: - Opcode = AMDGPU::V_CMPX_O_F32_e64; - break; - case ISD::SETUO: - Opcode = AMDGPU::V_CMPX_U_F32_e64; - break; - case ISD::SETUEQ: - Opcode = AMDGPU::V_CMPX_NLG_F32_e64; - break; - case ISD::SETUGT: - Opcode = AMDGPU::V_CMPX_NGE_F32_e64; - break; - case ISD::SETUGE: - Opcode = AMDGPU::V_CMPX_NGT_F32_e64; - break; - case ISD::SETULT: - Opcode = AMDGPU::V_CMPX_NLE_F32_e64; - break; - case ISD::SETULE: - Opcode = AMDGPU::V_CMPX_NLT_F32_e64; - break; - case ISD::SETUNE: - Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; - break; - default: - llvm_unreachable("invalid ISD:SET cond code"); - } - - const GCNSubtarget &ST = MBB.getParent()->getSubtarget(); - if (ST.hasNoSdstCMPX()) - Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode); - - assert(MI.getOperand(0).isReg()); - - if (TRI->isVGPR(MBB.getParent()->getRegInfo(), - MI.getOperand(0).getReg())) { - Opcode = AMDGPU::getVOPe32(Opcode); - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .add(MI.getOperand(1)) - .add(MI.getOperand(0)); - } else { - auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode)); - if (!ST.hasNoSdstCMPX()) - I.addReg(AMDGPU::VCC, RegState::Define); - - I.addImm(0) // src0 modifiers - .add(MI.getOperand(1)) - .addImm(0) // src1 modifiers - .add(MI.getOperand(0)); - - I.addImm(0); // omod - } - return true; - } - case AMDGPU::SI_KILL_I1_TERMINATOR: { - const MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const MachineOperand &Op = MI.getOperand(0); - int64_t KillVal = MI.getOperand(1).getImm(); - assert(KillVal == 0 || KillVal == -1); - - // Kill all threads if Op0 is an immediate and equal to the Kill value. - if (Op.isImm()) { - int64_t Imm = Op.getImm(); - assert(Imm == 0 || Imm == -1); - - if (Imm == KillVal) { - BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 - : AMDGPU::S_MOV_B64), Exec) - .addImm(0); - return true; - } - return false; - } - - unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; - if (ST.isWave32()) - Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32; - BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(Op); - return true; - } - default: - llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); - } -} - void SIInsertSkips::earlyTerm(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc DL = MI.getDebugLoc(); @@ -415,7 +233,9 @@ MDT = &getAnalysis(); SkipThreshold = SkipThresholdFlag; - SmallVector KillInstrs; + MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + ExecReg = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + SmallVector EarlyTermInstrs; bool MadeChange = false; @@ -440,41 +260,6 @@ } break; - case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: - case AMDGPU::SI_KILL_I1_TERMINATOR: { - MadeChange = true; - bool CanKill = kill(MI); - - // Check if we can add an early "if exec=0 { end shader }". - // - // Note that we _always_ do this if it is correct, even if the kill - // happens fairly late in the shader, because the null export should - // generally still be cheaper than normal export(s). - // - // TODO: The dominatesAllReachable check is conservative: if the - // dominance is only missing due to _uniform_ branches, we could - // in fact insert the early-exit as well. - if (CanKill && - MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && - dominatesAllReachable(MBB)) { - // Mark the instruction for kill-if-dead insertion. We delay this - // change because it modifies the CFG. - KillInstrs.push_back(&MI); - } else { - MI.eraseFromParent(); - } - break; - } - - case AMDGPU::SI_KILL_CLEANUP: - if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && - dominatesAllReachable(MBB)) { - KillInstrs.push_back(&MI); - } else { - MI.eraseFromParent(); - } - break; - case AMDGPU::SI_EARLY_TERMINATE_SCC0: EarlyTermInstrs.push_back(&MI); break; @@ -491,12 +276,6 @@ earlyTerm(*Instr); Instr->eraseFromParent(); } - for (MachineInstr *Kill : KillInstrs) { - skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()), - Kill->getDebugLoc()); - Kill->eraseFromParent(); - } - KillInstrs.clear(); EarlyTermInstrs.clear(); EarlyExitBlock = nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1640,6 +1640,18 @@ MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2271,10 +2283,12 @@ case AMDGPU::S_XOR_B64_term: case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -230,6 +230,7 @@ def S_XOR_B64_term : WrapTerminatorInst; def S_OR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +def S_AND_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -237,6 +238,7 @@ def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; +def S_AND_B32_term : WrapTerminatorInst; } @@ -337,24 +339,22 @@ // required in degenerate cases (when V_CMPX cannot be used due to constant // bus limitations) and because it allows us to avoid having to track SCC // liveness across basic blocks. 
- let Defs = [EXEC,VCC,SCC] in + let Defs = [EXEC,SCC] in def _PSEUDO : PseudoInstSI <(outs), ins> { let isConvergent = 1; let usesCustomInserter = 1; } - let Defs = [EXEC,VCC,SCC] in + let Defs = [EXEC,SCC] in def _TERMINATOR : SPseudoInstSI <(outs), ins> { let isTerminator = 1; } } defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; +let Defs = [VCC] in defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; -let Defs = [EXEC] in -def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>; - let Defs = [EXEC,VCC] in def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -72,10 +72,8 @@ MachineRegisterInfo *MRI = nullptr; SetVector LoweredEndCf; DenseSet LoweredIf; - SmallSet NeedsKillCleanup; const TargetRegisterClass *BoolRC = nullptr; - bool InsertKillCleanups; unsigned AndOpc; unsigned OrOpc; unsigned XorOpc; @@ -209,28 +207,7 @@ // just cleared bits. bool SimpleIf = isSimpleIf(MI, MRI); - if (InsertKillCleanups) { - // Check for SI_KILL_*_TERMINATOR on full path of control flow and - // flag the associated SI_END_CF for insertion of a kill cleanup. - auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - while (UseMI->getOpcode() != AMDGPU::SI_END_CF) { - assert(std::next(UseMI) == MRI->use_instr_nodbg_end()); - assert(UseMI->getOpcode() == AMDGPU::SI_ELSE); - MachineOperand &NextExec = UseMI->getOperand(0); - Register NextExecReg = NextExec.getReg(); - if (NextExec.isDead()) { - assert(!SimpleIf); - break; - } - UseMI = MRI->use_instr_nodbg_begin(NextExecReg); - } - if (UseMI->getOpcode() == AMDGPU::SI_END_CF) { - if (hasKill(MI.getParent(), UseMI->getParent(), TII)) { - NeedsKillCleanup.insert(&*UseMI); - SimpleIf = false; - } - } - } else if (SimpleIf) { + if (SimpleIf) { // Check for SI_KILL_*_TERMINATOR on path from if to endif. // if there is any such terminator simplifications are not safe. auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); @@ -449,8 +426,6 @@ auto E = B->end(); for ( ; It != E; ++It) { - if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP) - continue; if (TII->mayReadEXEC(*MRI, *It)) break; } @@ -503,18 +478,8 @@ LoweredEndCf.insert(NewMI); - // If this ends control flow which contains kills (as flagged in emitIf) - // then insert an SI_KILL_CLEANUP immediately following the exec mask - // manipulation. This can be lowered to early termination if appropriate. - MachineInstr *CleanUpMI = nullptr; - if (NeedsKillCleanup.count(&MI)) - CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP)); - - if (LIS) { + if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); - if (CleanUpMI) - LIS->InsertMachineInstrInMaps(*CleanUpMI); - } MI.eraseFromParent(); @@ -725,8 +690,6 @@ LIS = getAnalysisIfAvailable(); MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); - InsertKillCleanups = - MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; if (ST.isWave32()) { AndOpc = AMDGPU::S_AND_B32; @@ -750,8 +713,6 @@ Exec = AMDGPU::EXEC; } - SmallVector Worklist; - MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); BI != MF.end(); BI = NextBB) { @@ -775,10 +736,7 @@ case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: // Only build worklist if SI_IF instructions must be processed first. 
- if (InsertKillCleanups) - Worklist.push_back(&MI); - else - SplitMBB = process(MI); + SplitMBB = process(MI); break; default: @@ -792,14 +750,10 @@ } } - for (MachineInstr *MI : Worklist) - process(*MI); - optimizeEndCf(); LoweredEndCf.clear(); LoweredIf.clear(); - NeedsKillCleanup.clear(); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -220,6 +220,18 @@ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -11,10 +11,8 @@ /// shaders, and whole wavefront mode for all programs. /// /// Whole quad mode is required for derivative computations, but it interferes -/// with shader side effects (stores and atomics). This pass is run on the -/// scheduled machine IR but before register coalescing, so that machine SSA is -/// available for analysis. It ensures that WQM is enabled when necessary, but -/// disabled around stores and atomics. +/// with shader side effects (stores and atomics). It ensures that WQM is +/// enabled when necessary, but disabled around stores and atomics. 
/// /// When necessary, this pass creates a function prolog /// @@ -62,8 +60,10 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/IR/CallingConv.h" #include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" @@ -116,6 +116,8 @@ char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + bool NeedsLowering = false; }; struct WorkItem { @@ -129,23 +131,33 @@ class SIWholeQuadMode : public MachineFunctionPass { private: - CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; unsigned AndOpc; - unsigned XorTermrOpc; + unsigned AndN2Opc; + unsigned XorOpc; + unsigned AndSaveExecOpc; unsigned OrSaveExecOpc; - unsigned Exec; + unsigned WQMOpc; + Register Exec; + Register LiveMaskReg; DenseMap Instructions; MapVector Blocks; - SmallVector LiveMaskQueries; + + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; + SmallVector KillInstrs; void printInfo(); @@ -167,17 +179,26 @@ MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveWQM, unsigned LiveMaskReg); + Register SaveWQM); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedWQM); + Register SavedWQM); void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveOrig); + Register SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + Register SavedOrig, char NonWWMState); + + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI, + bool IsWQM); + MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI); + + void lowerBlock(MachineBasicBlock &MBB); + void processBlock(MachineBasicBlock &MBB, bool IsEntry); + + void lowerLiveMaskQueries(); void lowerCopyInstrs(); + void lowerKillInstrs(bool IsWQM); public: static char ID; @@ -193,7 +214,10 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -205,6 +229,8 @@ INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -262,8 +288,6 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, unsigned SubReg, char Flag, std::vector &Worklist) { - assert(!MRI->isSSA()); - LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); @@ -342,28 +366,14 @@ if 
(!Value) continue; - if (MRI->isSSA()) { - // Since we're in machine SSA, we do not need to track physical - // registers across basic blocks. - if (Value->isPHIDef()) - continue; - markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, - Worklist); - } else { - markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); - } + markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); } continue; } - if (MRI->isSSA()) { - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, Flag, Worklist); - } else { - LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist); - } + LiveRange &LR = LIS->getInterval(Reg); + markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist); } } @@ -444,10 +454,15 @@ } else { if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR || + Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) { + KillInstrs.push_back(&MI); + BBI.NeedsLowering = true; } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are // only used, outputs are only defined. + // FIXME: is this still valid? for (const MachineOperand &MO : MI.defs()) { if (!MO.isReg()) continue; @@ -604,6 +619,316 @@ return Restore; } +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " + << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = + BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS); + + // Convert last instruction in block to a terminator. + // Note: this only covers the expected patterns + unsigned NewOpcode = 0; + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + NewOpcode = AMDGPU::S_AND_B32_term; + break; + case AMDGPU::S_AND_B64: + NewOpcode = AMDGPU::S_AND_B64_term; + break; + case AMDGPU::S_MOV_B32: + NewOpcode = AMDGPU::S_MOV_B32_term; + break; + case AMDGPU::S_MOV_B64: + NewOpcode = AMDGPU::S_MOV_B64_term; + break; + default: + break; + } + if (NewOpcode) + TermMI->setDesc(TII->get(NewOpcode)); + + if (SplitBB != BB) { + // Update dominator trees + using DomTreeT = DomTreeBase; + SmallVector DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, BB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB}); + if (MDT) + MDT->getBase().applyUpdates(DTUpdates); + if (PDT) + PDT->getBase().applyUpdates(DTUpdates); + + // Link blocks + MachineInstr *MI = + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + LIS->InsertMachineInstrInMaps(*MI); + } + + return SplitBB; +} + +MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, + MachineInstr &MI) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opcode = 0; + + assert(MI.getOperand(0).isReg()); + + // Comparison is for live lanes; however here we compute the inverse + // (killed lanes). This is because VCMP will always generate 0 bits + // for inactive lanes so a mask of live lanes would not be correct + // inside control flow. + // Invert the comparison by swapping the operands and adjusting + // the comparison codes. 
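For illustration only (not part of the patch; the exit label and the SGPR pair holding the live mask are hypothetical): a kill whose condition came from `fcmp olt float %x, 0.0` (ISD::SETOLT) selects the V_CMP_LE_F32 entry below, so VCC ends up holding the killed lanes, which are then folded into the live mask and EXEC roughly as:

  v_cmp_le_f32_e32 vcc, 0, v0        ; vcc = lanes where 0 <= v0, i.e. the lanes being killed
  s_andn2_b64 s[2:3], s[2:3], vcc    ; remove killed lanes from the live mask; SCC=0 if none remain live
  s_cbranch_scc0 BB_EXIT             ; SI_EARLY_TERMINATE_SCC0: branch to the early-terminate block
  s_andn2_b64 exec, exec, vcc        ; otherwise just deactivate the killed lanes

This mirrors the sequences checked in the updated skip-if-dead.ll tests further down in the patch.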
+ + switch (MI.getOperand(2).getImm()) { + case ISD::SETOEQ: + case ISD::SETEQ: + Opcode = AMDGPU::V_CMP_LG_F32_e64; + break; + case ISD::SETOGT: + case ISD::SETGT: + Opcode = AMDGPU::V_CMP_GE_F32_e64; + break; + case ISD::SETOGE: + case ISD::SETGE: + Opcode = AMDGPU::V_CMP_GT_F32_e64; + break; + case ISD::SETOLT: + case ISD::SETLT: + Opcode = AMDGPU::V_CMP_LE_F32_e64; + break; + case ISD::SETOLE: + case ISD::SETLE: + Opcode = AMDGPU::V_CMP_LT_F32_e64; + break; + case ISD::SETONE: + case ISD::SETNE: + Opcode = AMDGPU::V_CMP_EQ_F32_e64; + break; + case ISD::SETO: + Opcode = AMDGPU::V_CMP_O_F32_e64; + break; + case ISD::SETUO: + Opcode = AMDGPU::V_CMP_U_F32_e64; + break; + case ISD::SETUEQ: + Opcode = AMDGPU::V_CMP_NLG_F32_e64; + break; + case ISD::SETUGT: + Opcode = AMDGPU::V_CMP_NLT_F32_e64; + break; + case ISD::SETUGE: + Opcode = AMDGPU::V_CMP_NLE_F32_e64; + break; + case ISD::SETULT: + Opcode = AMDGPU::V_CMP_NGT_F32_e64; + break; + case ISD::SETULE: + Opcode = AMDGPU::V_CMP_NGE_F32_e64; + break; + case ISD::SETUNE: + Opcode = AMDGPU::V_CMP_NEQ_F32_e64; + break; + default: + llvm_unreachable("invalid ISD:SET cond code"); + } + + // Pick opcode based on comparison type. + MachineInstr *VcmpMI; + const MachineOperand &Op0 = MI.getOperand(0); + const MachineOperand &Op1 = MI.getOperand(1); + if (TRI->isVGPR(*MRI, Op0.getReg())) { + Opcode = AMDGPU::getVOPe32(Opcode); + VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); + } else { + VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .addReg(AMDGPU::VCC, RegState::Define) + .addImm(0) // src0 modifiers + .add(Op1) + .addImm(0) // src1 modifiers + .add(Op0) + .addImm(0); // omod + } + + // VCC represents lanes killed. + Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + + MachineInstr *MaskUpdateMI = + BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(VCC); + + // State of SCC represents whether any lanes are live in mask, + // if SCC is 0 then no lanes will be alive anymore. + MachineInstr *EarlyTermMI = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); + + MachineInstr *ExecMaskMI = + BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC); + + assert(MBB.succ_size() == 1); + MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + + // Update live intervals + LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI); + MBB.remove(&MI); + + LIS->InsertMachineInstrInMaps(*MaskUpdateMI); + LIS->InsertMachineInstrInMaps(*ExecMaskMI); + LIS->InsertMachineInstrInMaps(*EarlyTermMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + return NewTerm; +} + +MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, + MachineInstr &MI, bool IsWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *MaskUpdateMI = nullptr; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + MachineInstr *ComputeKilledMaskMI = nullptr; + Register CndReg = !Op.isImm() ? Op.getReg() : Register(); + Register TmpReg; + + // Is this a static or dynamic kill? 
+ if (Op.isImm()) { + if (Op.getImm() == KillVal) { + // Static: all active lanes are killed + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(Exec); + } else { + // Static: kill does nothing + MachineInstr *NewTerm = nullptr; + assert(MBB.succ_size() == 1); + NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(*MBB.succ_begin()); + LIS->ReplaceMachineInstrInMaps(MI, *NewTerm); + MBB.remove(&MI); + return NewTerm; + } + } else { + if (!KillVal) { + // Op represents live lanes after kill, + // so exec mask needs to be factored in. + TmpReg = MRI->createVirtualRegister(TRI->getBoolRC()); + ComputeKilledMaskMI = + BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec); + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .addReg(TmpReg); + } else { + // Op represents lanes to kill + MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) + .addReg(LiveMaskReg) + .add(Op); + } + } + + // State of SCC represents whether any lanes are live in mask, + // if SCC is 0 then no lanes will be alive anymore. + MachineInstr *EarlyTermMI = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0)); + + // In the case we got this far some lanes are still live, + // update EXEC to deactivate lanes as appropriate. + MachineInstr *NewTerm; + if (Op.isImm()) { + unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); + } else if (!IsWQM) { + NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec) + .addReg(Exec) + .addReg(LiveMaskReg); + } else { + unsigned Opcode = KillVal ? AndN2Opc : AndOpc; + NewTerm = + BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op); + } + + // Update live intervals + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + assert(EarlyTermMI); + assert(MaskUpdateMI); + assert(NewTerm); + if (ComputeKilledMaskMI) + LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI); + LIS->InsertMachineInstrInMaps(*MaskUpdateMI); + LIS->InsertMachineInstrInMaps(*EarlyTermMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + if (CndReg) { + LIS->removeInterval(CndReg); + LIS->createAndComputeVirtRegInterval(CndReg); + } + if (TmpReg) + LIS->createAndComputeVirtRegInterval(TmpReg); + + return NewTerm; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/WWM/Exact) of instructions is known. 
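Again purely for illustration (SGPR numbers hypothetical), the dynamic i1 case in an exact-mode shader matches the sequences checked in the updated llvm.amdgcn.kill.ll: for `llvm.amdgcn.kill(i1 %x)` with the live mask in s[2:3] and %x lowered into s[0:1], roughly:

  s_xor_b64   s[0:1], s[0:1], exec     ; invert the keep condition within EXEC to get the killed lanes
  s_andn2_b64 s[2:3], s[2:3], s[0:1]   ; drop them from the live mask; SCC=0 when no live lanes remain
  s_cbranch_scc0 BB_EXIT               ; SI_EARLY_TERMINATE_SCC0: early-terminate the wave in that case
  s_and_b64   exec, exec, s[2:3]       ; otherwise clamp EXEC to the surviving live lanes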
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + const BlockInfo &BI = BII->second; + if (!BI.NeedsLowering) + return; + + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); + + SmallVector SplitPoints; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) + State = StateTransition[&MI]; + + MachineInstr *SplitPoint = nullptr; + switch (MI.getOpcode()) { + case AMDGPU::SI_KILL_I1_TERMINATOR: + SplitPoint = lowerKillI1(MBB, MI, State == StateWQM); + break; + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + SplitPoint = lowerKillF32(MBB, MI); + break; + default: + break; + } + if (SplitPoint) + SplitPoints.push_back(SplitPoint); + + II = Next; + } + + // Perform splitting after instruction scan to simplify iteration. + if (!SplitPoints.empty()) { + MachineBasicBlock *BB = &MBB; + for (MachineInstr *MI : SplitPoints) { + BB = splitBlock(BB, MI); + } + } +} + // Return an iterator in the (inclusive) range [First, Last] at which // instructions can be safely inserted, keeping in mind that some of the // instructions we want to add necessarily clobber SCC. @@ -680,93 +1005,88 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveWQM, unsigned LiveMaskReg) { + Register SaveWQM) { MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64), - SaveWQM) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM) .addReg(LiveMaskReg); } else { - unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64), - Exec) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec) .addReg(Exec) .addReg(LiveMaskReg); } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedWQM) { + Register SavedWQM) { MachineInstr *MI; - unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; if (SavedWQM) { MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec); } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SaveOrig) { + Register SaveOrig) { MachineInstr *MI; assert(SaveOrig); MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + Register SavedOrig, char NonWWMState) { MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), - ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; - const BlockInfo &BI = BII->second; + BlockInfo &BI = BII->second; // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) + if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) { + BI.InitialState = StateWQM; return; + } LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); - unsigned SavedWQMReg = 0; - unsigned SavedNonWWMReg = 0; - bool WQMFromExec = isEntry; - char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; + Register SavedWQMReg; + Register SavedNonWWMReg; + bool WQMFromExec = IsEntry; + char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; char NonWWMState = 0; const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); - if (isEntry) { + if (IsEntry) { // Skip the instruction that saves LiveMask if (II != IE && II->getOpcode() == AMDGPU::COPY) ++II; @@ -782,6 +1102,9 @@ // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + // Record initial state is block information. + BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -846,7 +1169,7 @@ if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); SavedNonWWMReg = 0; State = NonWWMState; @@ -865,7 +1188,7 @@ SavedWQMReg = MRI->createVirtualRegister(BoolRC); } - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -901,7 +1224,7 @@ assert(!SavedNonWWMReg); } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +void SIWholeQuadMode::lowerLiveMaskQueries() { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -933,7 +1256,7 @@ // And make it implicitly depend on exec (like all VALU movs should do). MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - } else if (!MRI->isSSA()) { + } else { // Remove early-clobber and exec dependency from simple SGPR copies. // This allows some to be eliminated during/post RA. 
LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); @@ -969,13 +1292,33 @@ } } +void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { + for (MachineInstr *MI : KillInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *SplitPoint = nullptr; + switch (MI->getOpcode()) { + case AMDGPU::SI_KILL_I1_TERMINATOR: + SplitPoint = lowerKillI1(*MBB, *MI, IsWQM); + break; + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + SplitPoint = lowerKillF32(*MBB, *MI); + break; + default: + continue; + } + if (SplitPoint) + splitBlock(MBB, SplitPoint); + } +} + bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); - CallingConv = MF.getFunction().getCallingConv(); + KillInstrs.clear(); + StateTransition.clear(); ST = &MF.getSubtarget(); @@ -983,64 +1326,72 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); + MDT = &getAnalysis(); + PDT = &getAnalysis(); if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; - XorTermrOpc = AMDGPU::S_XOR_B32_term; + AndN2Opc = AMDGPU::S_ANDN2_B32; + XorOpc = AMDGPU::S_XOR_B32; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + WQMOpc = AMDGPU::S_WQM_B32; Exec = AMDGPU::EXEC_LO; } else { AndOpc = AMDGPU::S_AND_B64; - XorTermrOpc = AMDGPU::S_XOR_B64_term; + AndN2Opc = AMDGPU::S_ANDN2_B64; + XorOpc = AMDGPU::S_XOR_B64; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + WQMOpc = AMDGPU::S_WQM_B64; Exec = AMDGPU::EXEC; } - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty()); + + LiveMaskReg = Exec; - lowerLiveMaskQueries(LiveMaskReg); + // Shader is simple does not need WQM/WWM or any complex lowering + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty() && KillInstrs.empty()) { + lowerLiveMaskQueries(); + return !LiveMaskQueries.empty(); + } - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - auto MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(ST->isWave32() ? 
AMDGPU::S_WQM_B32 - : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; - } + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); } LLVM_DEBUG(printInfo()); + lowerLiveMaskQueries(); lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + // Shader only needs WQM + if (GlobalFlags == StateWQM) { + auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + lowerKillInstrs(true); + } else { + for (auto BII : Blocks) + processBlock(*BII.first, BII.first == &Entry); + // Lowering blocks causes block splitting so perform as a second pass. + for (auto BII : Blocks) + lowerBlock(*BII.first); + } - if (LiveMaskReg) + // Compute live range for live mask + if (LiveMaskReg != Exec) LIS->createAndComputeVirtRegInterval(LiveMaskReg); // Physical registers like SCC aren't tracked by default anyway, so just @@ -1048,5 +1399,9 @@ // the analysis results. LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + // If we performed any kills then recompute EXEC + if (!KillInstrs.empty()) + LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + return true; } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -206,7 +206,8 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] @@ -261,7 +262,8 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] @@ -316,8 +318,9 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[10:11], exec +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz BB1_4 @@ -378,8 +381,9 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_mov_b32 s9, s8 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz BB1_4 diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir --- a/llvm/test/CodeGen/AMDGPU/early-term.mir +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -14,10 +14,6 @@ 
ret void } - define amdgpu_ps void @early_term_scc0_with_kill() { - ret void - } - define amdgpu_gs void @early_term_scc0_gs() { ret void } @@ -152,59 +148,6 @@ S_ENDPGM 0 ... ---- -name: early_term_scc0_with_kill -tracksRegLiveness: true -liveins: - - { reg: '$sgpr0' } - - { reg: '$sgpr1' } - - { reg: '$vgpr2' } -body: | - ; CHECK-LABEL: name: early_term_scc0_with_kill - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000), %bb.3(0x00000000) - ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr2 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: V_CMPX_LE_F32_nosdst_e32 0, killed $vgpr2, implicit-def $exec, implicit $mode, implicit $exec - ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK: bb.1: - ; CHECK: successors: %bb.4(0x40000000), %bb.3(0x40000000) - ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0 - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc - ; CHECK: bb.4: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $vgpr0, $scc - ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; CHECK: bb.2: - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec - ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.3: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 - bb.0: - liveins: $sgpr0, $sgpr1, $vgpr2 - successors: %bb.1 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - SI_KILL_F32_COND_IMM_TERMINATOR killed $vgpr2, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec - - bb.1: - liveins: $sgpr0, $sgpr1, $vgpr0 - successors: %bb.2 - dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - - bb.2: - liveins: $vgpr0, $vgpr1 - EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec - EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - S_ENDPGM 0 -... - --- name: early_term_scc0_gs tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir +++ /dev/null @@ -1,40 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 %s -o - | FileCheck %s -# https://bugs.freedesktop.org/show_bug.cgi?id=99019 ---- | - define amdgpu_ps void @kill_uncond_branch() { - ret void - } -... 
---- - -# CHECK-LABEL: name: kill_uncond_branch - -# CHECK: bb.0: -# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc - -# CHECK: bb.1: -# CHECK: V_CMPX_LE_F32_e32 -# CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - -# CHECK: bb.2: -# CHECK: S_ENDPGM 0 - -# CHECK: bb.3: -# CHECK-NEXT: EXP_DONE -# CHECK: S_ENDPGM 0 - -name: kill_uncond_branch - -body: | - bb.0: - successors: %bb.1 - S_CBRANCH_VCCNZ %bb.1, implicit $vcc - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec - S_BRANCH %bb.2 - - bb.2: - S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -14,14 +14,14 @@ %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00 %c2 = fcmp oge float %tmp3, 0.0 call void @llvm.amdgcn.kill(i1 %c2) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}vcc_implicit_def: -; GCN-NOT: v_cmp_gt_f32_e32 vcc, +; GCN: v_cmp_gt_f32_e32 vcc, 0, v{{[0-9]+}} ; GCN: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; GFX10: v_cmpx_le_f32_e32 0, v{{[0-9]+}} +; GCN: s_andn2_b64 exec, exec, vcc ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) { %tmp0 = fcmp olt float %arg13, 0.000000e+00 @@ -29,12 +29,12 @@ call void @llvm.amdgcn.kill(i1 %c1) %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}true: ; GCN-NEXT: %bb. -; GCN-NEXT: %bb. 
; GCN-NEXT: s_endpgm define amdgpu_gs void @true() { call void @llvm.amdgcn.kill(i1 true) @@ -46,6 +46,7 @@ ; GCN: s_mov_b64 exec, 0 define amdgpu_gs void @false() { call void @llvm.amdgcn.kill(i1 false) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -53,12 +54,15 @@ ; GCN: v_cmp_lt_i32 ; GCN: v_cmp_lt_i32 ; GCN: s_or_b64 s[0:1] -; GCN: s_and_b64 exec, exec, s[0:1] +; GCN: s_xor_b64 s[0:1], s[0:1], exec +; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN: s_and_b64 exec, exec, s[2:3] define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = or i1 %c1, %c2 call void @llvm.amdgcn.kill(i1 %x) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -66,154 +70,151 @@ ; GCN: v_cmp_lt_i32 ; GCN: v_cmp_lt_i32 ; GCN: s_xor_b64 s[0:1] -; GCN: s_andn2_b64 exec, exec, s[0:1] +; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN: s_and_b64 exec, exec, s[2:3] define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) { %c1 = icmp slt i32 %a, %b %c2 = icmp slt i32 %c, %d %x = xor i1 %c1, %c2 %y = xor i1 %x, 1 call void @llvm.amdgcn.kill(i1 %y) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}oeq: -; GCN: v_cmpx_eq_f32 -; GCN-NOT: s_and +; GCN: v_cmp_lg_f32 define amdgpu_gs void @oeq(float %a) { %c1 = fcmp oeq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ogt: -; GCN: v_cmpx_lt_f32 -; GCN-NOT: s_and +; GCN: v_cmp_ge_f32 define amdgpu_gs void @ogt(float %a) { %c1 = fcmp ogt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}oge: -; GCN: v_cmpx_le_f32 -; GCN-NOT: s_and +; GCN: v_cmp_gt_f32 define amdgpu_gs void @oge(float %a) { %c1 = fcmp oge float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}olt: -; GCN: v_cmpx_gt_f32 -; GCN-NOT: s_and +; GCN: v_cmp_le_f32 define amdgpu_gs void @olt(float %a) { %c1 = fcmp olt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ole: -; GCN: v_cmpx_ge_f32 -; GCN-NOT: s_and +; GCN: v_cmp_lt_f32 define amdgpu_gs void @ole(float %a) { %c1 = fcmp ole float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}one: -; GCN: v_cmpx_lg_f32 -; GCN-NOT: s_and +; GCN: v_cmp_eq_f32 define amdgpu_gs void @one(float %a) { %c1 = fcmp one float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ord: -; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. ; GCN: v_cmp_o_f32 define amdgpu_gs void @ord(float %a) { %c1 = fcmp ord float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}uno: -; FIXME: This is absolutely unimportant, but we could use the cmpx variant here. 
; GCN: v_cmp_u_f32 define amdgpu_gs void @uno(float %a) { %c1 = fcmp uno float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ueq: -; GCN: v_cmpx_nlg_f32 -; GCN-NOT: s_and +; GCN: v_cmp_nlg_f32 define amdgpu_gs void @ueq(float %a) { %c1 = fcmp ueq float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ugt: -; GCN: v_cmpx_nge_f32 -; GCN-NOT: s_and +; GCN: v_cmp_nlt_f32 define amdgpu_gs void @ugt(float %a) { %c1 = fcmp ugt float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}uge: -; SI: v_cmpx_ngt_f32_e32 vcc, -1.0 -; GFX10: v_cmpx_ngt_f32_e32 -1.0 -; GCN-NOT: s_and +; GCN: v_cmp_nle_f32_e32 vcc, -1.0 define amdgpu_gs void @uge(float %a) { %c1 = fcmp uge float %a, -1.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ult: -; SI: v_cmpx_nle_f32_e32 vcc, -2.0 -; GFX10: v_cmpx_nle_f32_e32 -2.0 -; GCN-NOT: s_and +; GCN: v_cmp_ngt_f32_e32 vcc, -2.0 define amdgpu_gs void @ult(float %a) { %c1 = fcmp ult float %a, -2.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}ule: -; SI: v_cmpx_nlt_f32_e32 vcc, 2.0 -; GFX10: v_cmpx_nlt_f32_e32 2.0 -; GCN-NOT: s_and +; GCN: v_cmp_nge_f32_e32 vcc, 2.0 define amdgpu_gs void @ule(float %a) { %c1 = fcmp ule float %a, 2.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}une: -; SI: v_cmpx_neq_f32_e32 vcc, 0 -; GFX10: v_cmpx_neq_f32_e32 0 -; GCN-NOT: s_and +; GCN: v_cmp_neq_f32_e32 vcc, 0 define amdgpu_gs void @une(float %a) { %c1 = fcmp une float %a, 0.0 call void @llvm.amdgcn.kill(i1 %c1) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } ; GCN-LABEL: {{^}}neg_olt: -; SI: v_cmpx_ngt_f32_e32 vcc, 1.0 -; GFX10: v_cmpx_ngt_f32_e32 1.0 -; GCN-NOT: s_and +; GCN: v_cmp_nle_f32_e32 vcc, 1.0 define amdgpu_gs void @neg_olt(float %a) { %c1 = fcmp olt float %a, 1.0 %c2 = xor i1 %c1, 1 call void @llvm.amdgcn.kill(i1 %c2) + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void } @@ -222,7 +223,7 @@ ; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0 ; GFX10: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0 ; GCN: v_cndmask_b32 -; GCN: v_cmpx_le_f32 +; GCN: v_cmp_gt_f32 define amdgpu_ps void @fcmp_x2(float %a) #0 { %ogt = fcmp nsz ogt float %a, 2.500000e-01 %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00 @@ -231,20 +232,24 @@ ret void } +; Note: an almost identical test for this exists in llvm.amdgcn.wqm.vote.ll ; GCN-LABEL: {{^}}wqm: ; GCN: v_cmp_neq_f32_e32 vcc, 0 -; GCN: s_wqm_b64 s[0:1], vcc +; GCN-DAG: s_wqm_b64 s[2:3], vcc +; GCN-DAG: s_mov_b64 s[0:1], exec +; GCN: s_xor_b64 s[2:3], s[2:3], exec +; GCN: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GCN: s_and_b64 exec, exec, s[0:1] -define amdgpu_ps void @wqm(float %a) { +define amdgpu_ps float @wqm(float %a) { %c1 = fcmp une float %a, 0.0 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) call void @llvm.amdgcn.kill(i1 %c2) - ret void + ret float 0.0 } ; This checks that we use the 64-bit encoding when the operand is a SGPR. 
 ; GCN-LABEL: {{^}}test_sgpr:
-; GCN: v_cmpx_ge_f32_e64
+; GCN: v_cmp_gt_f32_e64
 define amdgpu_ps void @test_sgpr(float inreg %a) #0 {
   %c = fcmp ole float %a, 1.000000e+00
   call void @llvm.amdgcn.kill(i1 %c) #1
@@ -252,7 +257,7 @@
 }
 ; GCN-LABEL: {{^}}test_non_inline_imm_sgpr:
-; GCN-NOT: v_cmpx_ge_f32_e64
+; GCN-NOT: v_cmp_le_f32_e64
 define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 {
   %c = fcmp ole float %a, 1.500000e+00
   call void @llvm.amdgcn.kill(i1 %c) #1
@@ -279,8 +284,42 @@
   ret void
 }
+; Check this compiles.
+; If kill is marked as defining VCC then this will fail with live interval issues.
+; GCN-LABEL: {{^}}kill_with_loop_exit:
+; GCN: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; GCN: s_andn2_b64 [[LIVE]], [[LIVE]], exec
+; GCN-NEXT: s_cbranch_scc0
+define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) {
+.entry:
+  %tmp24 = fcmp olt float %inp0, 1.280000e+02
+  %tmp25 = fcmp olt float %inp1, 1.280000e+02
+  %tmp26 = and i1 %tmp24, %tmp25
+  br i1 %tmp26, label %bb35, label %.preheader1.preheader
+
+.preheader1.preheader: ; preds = %.entry
+  %tmp31 = fcmp ogt float %inp3, 0.0
+  br label %bb
+
+bb: ; preds = %bb, %.preheader1.preheader
+  %tmp30 = phi float [ %tmp32, %bb ], [ 1.500000e+00, %.preheader1.preheader ]
+  %tmp32 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
+  %tmp34 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
+  br i1 %tmp31, label %bb, label %bb33
+
+bb33: ; preds = %bb
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %bb35
+
+bb35: ; preds = %bb33, %.entry
+  %tmp36 = phi float [ %tmp34, %bb33 ], [ 1.000000e+00, %.entry ]
+  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float %tmp36, float %tmp36, float %tmp36, float %tmp36, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
 declare void @llvm.amdgcn.kill(i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
 declare i1 @llvm.amdgcn.wqm.vote(i1)
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
@@ -34,22 +34,27 @@
   ret float %r
 }
+; Note: an almost identical test for this exists in llvm.amdgcn.kill.ll
 ;CHECK-LABEL: {{^}}kill:
 ;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
 ;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;WAVE64: s_and_b64 exec, exec, [[WQM]]
+;WAVE64: s_xor_b64 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
+;WAVE64: s_and_b64 exec, exec, [[MASK]]
 ;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]]
-;WAVE32: s_and_b32 exec_lo, exec_lo, [[WQM]]
+;WAVE32: s_xor_b32 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE32: s_andn2_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
+;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]]
 ;CHECK: s_endpgm
-define amdgpu_ps void @kill(i32 %v0, i32 %v1) #1 {
+define amdgpu_ps float @kill(i32 %v0, i32 %v1) #1 {
 main_body:
   %c = icmp eq i32 %v0, %v1
   %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
   call void @llvm.amdgcn.kill(i1 %w)
-  ret void
+  ret float 0.0
 }
 declare void @llvm.amdgcn.kill(i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -2,7 +2,6 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
   call void @llvm.amdgcn.kill(i1 true)
@@ -11,11 +10,11 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execz BB1_2
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_andn2
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB1_2:
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
@@ -26,15 +25,17 @@
 ; FIXME: Ideally only one would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execz BB2_3
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execz BB2_3
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB2_3:
-; CHECK: exp null
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -44,12 +45,13 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execz BB3_2
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_andn2_b64 exec, exec, vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB3_2:
-; CHECK: exp null
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -60,15 +62,19 @@
 ; FIXME: Ideally only one would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execz BB4_3
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; %bb.1:
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execz BB4_3
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_andn2_b64 exec, exec, vcc
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB4_3:
-; CHECK: exp null
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -80,15 +86,19 @@
 ; FIXME: Ideally only one early-exit would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execz BB5_3
-; CHECK-NEXT: ; %bb.1
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: s_cbranch_execz BB5_3
-; CHECK-NEXT: ; %bb.2
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_andn2_b64 exec, exec, vcc
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB5_3:
-; CHECK: exp null
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -100,15 +110,19 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
 ; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execz BB6_3
+; CHECK-NEXT: s_mov_b64 s[0:1], exec
+; CHECK-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_andn2_b64 exec, exec, vcc
 ; CHECK: v_mov_b32_e64 v7, -1
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execz BB6_3
-; CHECK-NEXT: ; %bb.2:
+; CHECK: v_cmp_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB]]
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB6_3:
+; CHECK-NEXT: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
@@ -124,9 +138,12 @@
 ; CHECK-LABEL: {{^}}test_kill_control_flow:
 ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+; CHECK: s_cbranch_scc0 [[BODY_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.1:
+; CHECK: v_mov_b32_e32 v0, 1.0
+; CHECK: s_branch [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; [[BODY_BB]]:
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
@@ -139,12 +156,17 @@
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-
-; TODO: We could do an early-exit here (the branch above is uniform!)
-; CHECK-NOT: exp null
+; CHECK: v_cmp_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK: s_andn2_b64 exec, exec, vcc
 ; CHECK: v_mov_b32_e32 v0, 1.0
+
+; CHECK: [[EXIT_BB]]
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
@@ -189,10 +211,9 @@
 ; CHECK: ;;#ASMEND
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-
-; TODO: We could do an early-exit here (the branch above is uniform!)
-; CHECK-NOT: exp null
+; CHECK: v_cmp_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK: buffer_store_dword v8
 ; CHECK: v_mov_b32_e64 v9, -2
@@ -200,6 +221,11 @@
 ; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
 ; CHECK: buffer_store_dword v9
 ; CHECK-NEXT: s_endpgm
+
+; CHECK: [[EXIT_BB]]
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
@@ -234,9 +260,12 @@
 ; CHECK-LABEL: {{^}}test_kill_control_flow_return:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: v_cmp_eq_u32_e64 [[KILL_CC:s\[[0-9]+:[0-9]+\]]], s0, 1
-; CHECK: s_and_b64 exec, exec, s[2:3]
-; CHECK-NEXT: s_cbranch_execz [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK: s_xor_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[KILL_CC]], exec
+; CHECK: s_andn2_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], [[LIVE]], [[TMP]]
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK: s_and_b64 exec, exec, [[MASK]]
 ; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; CHECK: s_cbranch_scc0 [[COND_BB:BB[0-9]+_[0-9]+]]
@@ -257,6 +286,7 @@
 ; CHECK: v_mov_b32_e32 v0, v7
 ; CHECK: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
@@ -301,7 +331,9 @@
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_nop_e64
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
+; CHECK: v_cmp_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; %bb.3:
 ; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
@@ -313,6 +345,11 @@
 ; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
+
+; CHECK: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
@@ -345,10 +382,12 @@
 ; bug 28550
 ; CHECK-LABEL: {{^}}phi_use_def_before_kill:
 ; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
-; CHECK: v_cmpx_lt_f32_e32 vcc, 0,
-; CHECK-NEXT: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]+]]
+; CHECK: v_cmp_ge_f32_e32 vcc, 0,
+; CHECK-NEXT: s_andn2_b64
+; CHECK-NEXT: s_cbranch_scc0 [[EXITBB:BB[0-9]+_[0-9]+]]
 ; CHECK: ; %[[KILLBB:bb.[0-9]+]]:
+; CHECK-NEXT: s_andn2_b64
 ; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
 ; CHECK: [[PHIBB]]:
@@ -363,7 +402,8 @@
 ; CHECK-NEXT: s_endpgm
 ; CHECK: [[EXITBB]]:
-; CHECK: exp null
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
 bb:
@@ -395,13 +435,21 @@
 ; CHECK: v_cmp_nge_f32
 ; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
-; CHECK: ; %bb6
-; CHECK: s_mov_b64 exec, 0
+; FIXME: ideally this should just be a s_branch
+; CHECK: s_mov_b64 s[2:3], exec
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; %bb6
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK: [[SKIPKILL]]:
 ; CHECK: v_cmp_nge_f32_e32 vcc
-; CHECK: %bb.3: ; %bb5
-; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
+; CHECK: %bb.4: ; %bb5
+
+; CHECK: [[EXIT_BB]]
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
@@ -430,8 +478,9 @@
 ; CHECK: s_and_saveexec_b64
 ; CHECK: s_xor_b64
-; CHECK: v_cmpx_gt_f32_e32 vcc, 0,
-; CHECK: BB{{[0-9]+_[0-9]+}}:
+; CHECK: v_cmp_le_f32_e32 vcc, 0,
+; CHECK: s_cbranch_scc0 [[EXIT_BB:BB[0-9]+_[0-9]+]]
+
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: image_sample_c
@@ -445,6 +494,12 @@
 ; CHECK: [[END]]:
 ; CHECK: s_endpgm
+
+; CHECK: [[EXIT_BB]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
+
 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
@@ -470,10 +525,13 @@
 }
 ; CHECK-LABEL: {{^}}cbranch_kill:
+; CHECK: ; %bb.{{[0-9]+}}: ; %kill
+; CHECK-NEXT: s_andn2
+; CHECK-NEXT: s_cbranch_scc0 [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK: ; %bb.{{[0-9]+}}: ; %export
 ; CHECK-NEXT: s_or_b64
-; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK: [[EXIT]]:
+; CHECK-NEXT: s_mov_b64 exec, 0
 ; CHECK-NEXT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
@@ -512,7 +570,7 @@
 ; CHECK-LABEL: {{^}}complex_loop:
 ; CHECK: s_mov_b64 exec, 0
-; CHECK-NOT: exp null
+; CHECK: exp null
 define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 .entry:
   %flaga = icmp sgt i32 %cmpa, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -84,46 +84,51 @@
   ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000)
   ; GCN: liveins: $vgpr0
   ; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+  ; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
   ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
-  ; GCN: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GCN: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
   ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
   ; GCN: bb.1.flow.preheader:
   ; GCN: successors: %bb.2(0x80000000)
-  ; GCN: liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
   ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
-  ; GCN: renamable $sgpr2_sgpr3 = S_MOV_B64 0
+  ; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
   ; GCN: bb.2.flow:
   ; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-  ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3
-  ; GCN: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
-  ; GCN: renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc
-  ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+  ; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
+  ; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
+  ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
   ; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec
   ; GCN: bb.3.Flow:
   ; GCN: successors: %bb.4(0x80000000)
-  ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
-  ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+  ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GCN: bb.4.Flow1:
-  ; GCN: successors: %bb.5(0x40000000), %bb.6(0x40000000)
-  ; GCN: liveins: $sgpr0_sgpr1
-  ; GCN: renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
-  ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GCN: successors: %bb.5(0x40000000)
+  ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+  ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
   ; GCN: bb.5.kill0:
+  ; GCN: successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+  ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
+  ; GCN: S_CBRANCH_SCC0 %bb.7, implicit $scc
+  ; GCN: bb.8.kill0:
   ; GCN: successors: %bb.6(0x80000000)
-  ; GCN: liveins: $sgpr0_sgpr1
+  ; GCN: liveins: $sgpr2_sgpr3, $scc
   ; GCN: $exec = S_MOV_B64 0
   ; GCN: bb.6.end:
-  ; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
-  ; GCN: liveins: $sgpr0_sgpr1
-  ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
-  ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
-  ; GCN: S_BRANCH %bb.8
+  ; GCN: successors: %bb.9(0x80000000)
+  ; GCN: liveins: $sgpr2_sgpr3
+  ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN: S_BRANCH %bb.9
   ; GCN: bb.7:
+  ; GCN: $exec = S_MOV_B64 0
   ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
   ; GCN: S_ENDPGM 0
-  ; GCN: bb.8:
+  ; GCN: bb.9:
 entry:
   %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
   %cmp0 = fcmp olt float %.i0, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 # GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
 # GCN: $sgpr0 = S_MOV_B32 $exec_lo
@@ -11,7 +11,7 @@
     successors: %bb.1
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
     $sgpr0 = S_MOV_B32 $exec_lo
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
     S_BRANCH %bb.1
   bb.1:
@@ -29,7 +29,7 @@
     successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0_sgpr1 = S_MOV_B64 $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -45,7 +45,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 $exec_lo, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -61,7 +61,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -80,7 +80,7 @@
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_MOV_B32 $exec_lo
    $vgpr0 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -99,7 +99,7 @@
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_MOV_B32 $exec_lo
    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -118,7 +118,7 @@
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_MOV_B32 $exec_lo
    S_WAITCNT_DEPCTR 65534
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -137,7 +137,7 @@
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_MOV_B32 $exec_lo
    S_WAITCNT_DEPCTR 65535
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -156,7 +156,7 @@
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
    $sgpr0 = S_MOV_B32 $exec_lo
    S_WAITCNT_DEPCTR 61438
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
 # GCN-LABEL: name: hazard_vcmpx_permlane16
 # GCN: V_CMPX_LE_F32_nosdst_e32
@@ -11,7 +11,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -33,7 +33,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -56,7 +56,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -79,7 +79,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -110,7 +110,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
@@ -133,7 +133,7 @@
  bb.0:
    successors: %bb.1
    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+    V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
    S_BRANCH %bb.1
  bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -34,10 +34,10 @@
   ret void
 }
-; GCN-LABEL: {{^}}test_vopc_vcmpx:
-; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-define amdgpu_ps void @test_vopc_vcmpx(float %x) {
+; GCN-LABEL: {{^}}test_vopc_vcmp:
+; GFX1032: v_cmp_gt_f32_e32 vcc_lo, 0, v{{[0-9]+}}
+; GFX1064: v_cmp_gt_f32_e32 vcc, 0, v{{[0-9]+}}
+define amdgpu_ps void @test_vopc_vcmp(float %x) {
   %cmp = fcmp oge float %x, 0.0
   call void @llvm.amdgcn.kill(i1 %cmp)
   ret void
@@ -658,15 +658,22 @@
 }
 ; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
 ; GFX1032: s_or_b32 [[OR:s[0-9]+]],
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[OR]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
 ; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
-; GFX1064: s_and_b64 exec, exec, [[OR]]
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[OR]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
 define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
   %c1 = icmp slt i32 %a, %b
   %c2 = icmp slt i32 %c, %d
   %x = or i1 %c1, %c2
   call void @llvm.amdgcn.kill(i1 %x)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   ret void
 }
@@ -828,15 +835,22 @@
 ; GCN-LABEL: {{^}}test_wqm_vote:
 ; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
 ; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[WQM]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
 ; GFX1064: v_cmp_neq_f32_e32 vcc, 0
-; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}}
-; GFX1064: s_and_b64 exec, exec, [[WQM]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
+; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[WQM]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
 define amdgpu_ps void @test_wqm_vote(float %a) {
   %c1 = fcmp une float %a, 0.0
   %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
   call void @llvm.amdgcn.kill(i1 %c2)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   ret void
 }
@@ -1125,9 +1139,11 @@
 declare i1 @llvm.amdgcn.ps.live()
 declare i64 @llvm.cttz.i64(i64, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone optnone noinline }
 attributes #3 = { "target-features"="+wavefrontsize32" }
 attributes #4 = { "target-features"="+wavefrontsize64" }
+attributes #5 = { inaccessiblememonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -576,7 +576,7 @@
 ;CHECK: image_sample
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: v_cmpx_
+;CHECK: v_cmp_
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: image_sample
@@ -611,9 +611,9 @@
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 ; CHECK: image_sample
-; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
-; CHECK: v_cmpx_
+; CHECK-DAG: buffer_store_dword
+; CHECK-DAG: v_cmp_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0