Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -39,7 +39,6 @@
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSIWholeQuadModePass();
-FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
@@ -69,8 +68,10 @@
 extern char &SIWholeQuadModeID;

 void initializeSILowerControlFlowPass(PassRegistry &);
-extern char &SILowerControlFlowPassID;
+extern char &SILowerControlFlowID;
+
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;

 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -80,6 +80,7 @@
   initializeSIInsertWaitsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
+  initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
 }

@@ -532,13 +533,6 @@
 #endif

 void GCNPassConfig::addPreRegAlloc() {
-  // This needs to be run directly before register allocation because
-  // earlier passes might recompute live intervals.
-  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
-  if (getOptLevel() > CodeGenOpt::None) {
-    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
-  }
-
   if (getOptLevel() > CodeGenOpt::None) {
     // Don't do this with no optimizations since it throws away debug info by
     // merging nonadjacent loads.
@@ -556,10 +550,22 @@
 }

 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  // FIXME: We have to disable the verifier here because of PHIElimination +
+  // TwoAddressInstructions disabling it.
+  insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);
+
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }

 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  // This needs to be run directly before register allocation because earlier
+  // passes might recompute live intervals.
+  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
+  // TODO: It might be better to run this right after phi elimination, but for
+  // now that would require not running the verifier.
+  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
+
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }

@@ -579,7 +585,7 @@
   addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSILowerControlFlowPass());
+  addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -67,6 +67,7 @@
   SIFixSGPRCopies.cpp
   SIFoldOperands.cpp
   SIFrameLowering.cpp
+  SIInsertSkips.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -0,0 +1,330 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass inserts branches on the 0 exec mask over divergent
+/// branches when it's expected that jumping over the untaken control flow
+/// will be cheaper than having every workitem no-op through it.
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+namespace {
+
+static cl::opt<unsigned> SkipThresholdFlag(
+  "amdgpu-skip-threshold",
+  cl::desc("Number of instructions before jumping over divergent control flow"),
+  cl::init(12), cl::Hidden);
+
+class SIInsertSkips : public MachineFunctionPass {
+private:
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+  unsigned SkipThreshold;
+
+  bool shouldSkip(const MachineBasicBlock &From,
+                  const MachineBasicBlock &To) const;
+
+  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+  void kill(MachineInstr &MI);
+
+  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+public:
+  static char ID;
+
+  SIInsertSkips() :
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI insert s_cbranch_execz instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+                "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+  switch (Opc) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::BUNDLE:
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::GC_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+                               const MachineBasicBlock &To) const {
+  if (From.succ_empty())
+    return false;
+
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         NumInstr < SkipThreshold && I != E; ++I) {
+      if (opcodeEmitsNoInsts(I->getOpcode()))
+        continue;
+
+      // FIXME: Since this is required for correctness, this should be inserted
+      // during SILowerControlFlow.
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (I->isInlineAsm()) {
+        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+        const char *AsmStr = I->getOperand(0).getSymbolName();
+
+        // inlineasm length estimate is number of bytes assuming the longest
+        // instruction.
+        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+      } else {
+        ++NumInstr;
+      }
+
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction *MF = MBB.getParent();
+
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+      !shouldSkip(MBB, MBB.getParent()->back()))
+    return false;
+
+  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  // If the exec mask is non-zero, skip the next two instructions
+  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&NextBB);
+
+  MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+  // Exec mask is zero: Export to NULL target...
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+    .addImm(0)
+    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+    .addImm(0)
+    .addImm(1)
+    .addImm(1)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+  // ... and terminate wavefront.
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+  return true;
+}
+
+void SIInsertSkips::kill(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(CallConv == CallingConv::AMDGPU_PS ||
+         CallConv == CallingConv::AMDGPU_GS);
+#endif
+
+  // Clear this thread from the exec mask if the operand is negative.
+  if (Op.isImm()) {
+    // Constant operand: Set exec mask to 0 or do nothing
+    if (Op.getImm() & 0x80000000) {
+      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+        .addImm(0);
+    }
+  } else {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
+      .addImm(0)
+      .addOperand(Op);
+  }
+}
+
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+  MachineFunction *MF = MBB.getParent();
+
+  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, SkipBB);
+  MBB.addSuccessor(SkipBB);
+
+  return SkipBB;
+}
+
+// Returns true if a branch over the block was inserted.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+                                   MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
+
+  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+    .addMBB(DestBB);
+
+  return true;
+}
+
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  SkipThreshold = SkipThresholdFlag;
+
+  bool HaveKill = false;
+  bool MadeChange = false;
+
+  // Track depth of exec mask, divergent branches.
+  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+  MachineFunction::iterator NextBB;
+
+  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
+    MachineBasicBlock &MBB = *BI;
+
+    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+      // Reached convergence point for last divergent branch.
+      ExecBranchStack.pop_back();
+    }
+
+    if (HaveKill && ExecBranchStack.empty()) {
+      HaveKill = false;
+
+      // TODO: Insert skip if exec is 0?
+    }
+
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_MASK_BRANCH: {
+        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+        MadeChange |= skipMaskBranch(MI, MBB);
+        break;
+      }
+      case AMDGPU::S_BRANCH: {
+        // Optimize out branches to the next block.
+        // FIXME: Shouldn't this be handled by BranchFolding?
+        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+          MI.eraseFromParent();
+        break;
+      }
+      case AMDGPU::SI_KILL_TERMINATOR: {
+        MadeChange = true;
+        kill(MI);
+
+        if (ExecBranchStack.empty()) {
+          if (skipIfDead(MI, *NextBB)) {
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+        } else {
+          HaveKill = true;
+        }
+
+        MI.eraseFromParent();
+        break;
+      }
+      case AMDGPU::SI_RETURN: {
+        // FIXME: Should move somewhere else
+        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+        // because external bytecode will be appended at the end.
+        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+          // SI_RETURN is not the last instruction. Add an empty block at
+          // the end and jump there.
+          if (!EmptyMBBAtEnd) {
+            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+            MF.insert(MF.end(), EmptyMBBAtEnd);
+          }
+
+          MBB.addSuccessor(EmptyMBBAtEnd);
+          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+            .addMBB(EmptyMBBAtEnd);
+          I->eraseFromParent();
+        }
+      }
+      default:
+        break;
+      }
+    }
+  }
+
+  return MadeChange;
+}
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1807,6 +1807,7 @@
   let isTerminator = 1;
   let isBarrier = 0;
   let SALU = 1;
+  let Uses = [EXEC];
 }

 let Uses = [EXEC], Defs = [EXEC, SCC] in {
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,8 +58,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/MC/MCAsmInfo.h"

 using namespace llvm;

@@ -67,46 +65,50 @@
 namespace {

-static cl::opt<unsigned> SkipThresholdFlag(
-  "amdgpu-skip-threshold",
-  cl::desc("Number of instructions before jumping over divergent control flow"),
-  cl::init(12), cl::Hidden);
+// TODO: Remove this option
+static cl::opt<bool> EnableNoSaveExecLowering(
+  "amdgpu-no-saveexec-control-flow",
+  cl::desc("Don't use saveexec instructions to lower control flow"),
+  cl::ReallyHidden,
+  cl::init(false));

 class SILowerControlFlow : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
-  unsigned SkipThreshold;
+  LiveIntervals *LIS;
+  bool IsNoOpt;

-  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);

-  MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
-  void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
-  void Break(MachineInstr &MI);
-  void IfBreak(MachineInstr &MI);
-  void ElseBreak(MachineInstr &MI);
-  void Loop(MachineInstr &MI);
-  void EndCf(MachineInstr &MI);
-
-  void Kill(MachineInstr &MI);
-  void Branch(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
 public:
   static char ID;

   SILowerControlFlow() :
-    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+    MachineFunctionPass(ID),
+    TRI(nullptr),
+    TII(nullptr),
+    LIS(nullptr),
+    IsNoOpt(false) { }

   bool runOnMachineFunction(MachineFunction &MF) override;

   const char *getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
 };

 } // End anonymous namespace

@@ -114,403 +116,248 @@
 char SILowerControlFlow::ID = 0;

 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
-                "SI lower control flow", false, false)
-
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
-
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
-  return new SILowerControlFlow();
-}
-
-static bool opcodeEmitsNoInsts(unsigned Opc) {
-  switch (Opc) {
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-  case TargetOpcode::BUNDLE:
-  case TargetOpcode::CFI_INSTRUCTION:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::GC_LABEL:
-  case TargetOpcode::DBG_VALUE:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
-                                    MachineBasicBlock *To) {
-  if (From->succ_empty())
-    return false;
-
-  unsigned NumInstr = 0;
-  MachineFunction *MF = From->getParent();
-
-  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(I->getOpcode()))
-        continue;
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
+                "SI lower control flow", false, false)
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-  return false;
-}
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
-MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
-    return nullptr;
+  MachineOperand &SaveExec = MI.getOperand(0);
+  MachineOperand &Cond = MI.getOperand(1);
+  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+         Cond.getSubReg() == AMDGPU::NoSubRegister);
-  const DebugLoc &DL = From.getDebugLoc();
-  MachineInstr *Skip =
-    BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-    .addOperand(To);
-  return Skip;
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
+  unsigned SaveExecReg = SaveExec.getReg();
-  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return false;
+  MachineInstr *AndSaveExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
+    .addOperand(Cond);
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-  MBB.addSuccessor(SkipBB);
+  MachineInstr *Xor =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
-  const DebugLoc &DL = MI.getDebugLoc();
+  // Insert a pseudo terminator to help keep the verifier happy. This will also
+  // be used later when inserting skips.
+  MachineInstr *NewBr =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2))
+    .addReg(SaveExecReg, getKillRegState(SaveExec.isKill()));
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef);
+  LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*NewBr);
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+  MI.eraseFromParent();
-  return true;
+  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+  // hard to add another def here but I'm not sure how to correctly update the
+  // valno.
+  LIS->removeInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(SaveExecReg);
 }

-void SILowerControlFlow::If(MachineInstr &MI) {
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
-    .addReg(Vcc);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
+  bool ExecModified = MI.getOperand(3).getImm() != 0;
+  MachineBasicBlock::iterator Start = MBB.begin();
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+  // This must be inserted before phis and any spill code inserted before the
+  // else.
+  MachineInstr *OrSaveExec =
+    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
+  MachineBasicBlock::iterator ElsePt(MI);
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Reg);
+  if (ExecModified) {
+    MachineInstr *And =
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+      .addReg(AMDGPU::EXEC)
+      .addReg(DstReg);
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Else(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
-    .addReg(Src); // Saved EXEC
-
-  if (MI.getOperand(3).getImm() != 0) {
-    // Adjust the saved exec to account for the modifications during the flow
-    // block that contains the ELSE. This can happen when WQM mode is switched
-    // off.
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
-      .addReg(AMDGPU::EXEC)
-      .addReg(Dst);
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*And);
   }

-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Dst);
-
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
-
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
+  MachineInstr *Xor =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(DstReg);
+
+  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
   // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Dst);
+  MachineInstr *Branch =
+    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addMBB(DestBB)
+    .addReg(DstReg);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
   MI.eraseFromParent();
-}

-void SILowerControlFlow::Break(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  LIS->InsertMachineInstrInMaps(*OrSaveExec);
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*Branch);
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
+  // src reg is tied to dst reg.
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
-  MI.eraseFromParent();
+  // Let this be recomputed.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }

-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
+  const DebugLoc &DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Vcc)
-    .addReg(Src);
+  MachineInstr *Or =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(1));
+
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
   MI.eraseFromParent();
 }

-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Saved = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Saved)
-    .addReg(Src);
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+}
-  MI.eraseFromParent();
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }

-void SILowerControlFlow::Loop(MachineInstr &MI) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Src = MI.getOperand(0).getReg();
+  const DebugLoc &DL = MI.getDebugLoc();
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
+  MachineInstr *AndN2 =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+  MachineInstr *Branch =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
     .addOperand(MI.getOperand(1));

-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
+  if (LIS) {
+    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+    LIS->InsertMachineInstrInMaps(*Branch);
+  }

   MI.eraseFromParent();
 }

-void SILowerControlFlow::Branch(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
-  if (MBB == MI.getParent()->getNextNode())
-    MI.eraseFromParent();
-
-  // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-
-  // Clear this thread from the exec mask if the operand is negative
-  if ((Op.isImm())) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
-    }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-      .addImm(0)
-      .addOperand(Op);
-  }
-
-  MI.eraseFromParent();
-}
+  const DebugLoc &DL = MI.getDebugLoc();

-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineInstr *NewMI =
+    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
-  MF->insert(MBBI, SkipBB);
+  MI.eraseFromParent();
-  return SkipBB;
+  if (LIS)
+    LIS->handleMove(*NewMI);
 }

 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  SkipThreshold = SkipThresholdFlag;

-  bool HaveKill = false;
-  unsigned Depth = 0;
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();

-  MachineFunction::iterator NextBB;
+  // FIXME: This is hack for fast regalloc. All live registers are spilled at
+  // the end of the block before the first terminator. We need to define a
+  // register to save exec, but can't insert a spill before then. Don't use the
+  // saveexec instructions and separately copy to exec to avoid needing to spill
+  // at the same time that a register is defined.
+  IsNoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None ||
+            EnableNoSaveExecLowering;
+
+  if (LIS) {
+    DEBUG(LIS->dump());
+  }

   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock *EmptyMBBAtEnd = nullptr;

     MachineBasicBlock::iterator I, Next;
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;

       switch (MI.getOpcode()) {
-      default: break;
-      case AMDGPU::SI_IF:
-        ++Depth;
-        If(MI);
-        break;
-
-      case AMDGPU::SI_ELSE:
-        Else(MI);
-        break;
-
-      case AMDGPU::SI_BREAK:
-        Break(MI);
-        break;
-
-      case AMDGPU::SI_IF_BREAK:
-        IfBreak(MI);
-        break;
-
-      case AMDGPU::SI_ELSE_BREAK:
-        ElseBreak(MI);
-        break;
-
-      case AMDGPU::SI_LOOP:
-        ++Depth;
-        Loop(MI);
-        break;
-
-      case AMDGPU::SI_END_CF:
-        if (--Depth == 0 && HaveKill) {
-          HaveKill = false;
-          // TODO: Insert skip if exec is 0?
-        }
-
-        EndCf(MI);
-        break;
-
-      case AMDGPU::SI_KILL_TERMINATOR:
-        if (Depth == 0) {
-          if (skipIfDead(MI, *NextBB)) {
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
-        } else
-          HaveKill = true;
-        Kill(MI);
-        break;
-
-      case AMDGPU::S_BRANCH:
-        Branch(MI);
-        break;
-
-      case AMDGPU::SI_RETURN: {
-        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
-        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
-        // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
-          // SI_RETURN is not the last instruction. Add an empty block at
-          // the end and jump there.
-          if (!EmptyMBBAtEnd) {
-            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
-            MF.insert(MF.end(), EmptyMBBAtEnd);
-          }
-
-          MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
-            .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
-        }
-        break;
-      }
+      case AMDGPU::SI_IF:
+        emitIf(MI);
+        break;
+
+      case AMDGPU::SI_ELSE:
+        emitElse(MI);
+        break;
+
+      case AMDGPU::SI_BREAK:
+        emitBreak(MI);
+        break;
+
+      case AMDGPU::SI_IF_BREAK:
+        emitIfBreak(MI);
+        break;
+
+      case AMDGPU::SI_ELSE_BREAK:
+        emitElseBreak(MI);
+        break;
+
+      case AMDGPU::SI_LOOP:
+        emitLoop(MI);
+        break;
+
+      case AMDGPU::SI_END_CF:
+        emitEndCf(MI);
+        break;
+
+      default:
+        break;
       }
     }
   }
+
   return true;
 }
Index: test/CodeGen/AMDGPU/else.ll
===================================================================
--- test/CodeGen/AMDGPU/else.ll
+++ test/CodeGen/AMDGPU/else.ll
@@ -25,11 +25,13 @@
 }

 ; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; BB#0:
+; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: ; %Flow
 ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_and_b64 exec, exec,
-; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
+; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
 ; CHECK-NEXT: ; mask branch
 define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
 main_body:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -2,11 +2,33 @@

 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

-; SI-LABEL: @test_if
+; SI-LABEL: {{^}}test_if:
 ; Make sure the i1 values created by the cfg structurizer pass are
 ; moved using VALU instructions
+
+
+; waitcnt should be inserted after exec modification
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
+
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+; SI: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: ; mask branch
+
+; v_mov should be after exec modification
+; SI: [[FLOW_BB]]:
+; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
+; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
+; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; SI-NEXT: ; mask branch
+;
 define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -17,12 +39,12 @@
 case0:
   %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
   br label %end

 case1:
   %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
   br label %end

 default:
@@ -31,11 +53,11 @@
   br i1 %cmp8, label %if, label %else

 if:
-  store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
   br label %end

 else:
-  store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
   br label %end

 end:
@@ -139,10 +161,11 @@
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]

 ; SI: [[LABEL_FLOW]]:
-; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
-; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz [[LABEL_LOOP]]
+; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

 ; SI: BB#5
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]