Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -747,6 +747,15 @@
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// Copies the active channels of the source value to the destination value,
+// with the guarantee that the source value is computed as if the entire
+// program were executed in Whole Wavefront Mode, i.e. with all channels
+// enabled, with a few exceptions:
+// - Phi nodes which require WWM return an undefined value.
+def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -48,6 +48,7 @@
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 
@@ -97,6 +98,9 @@
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;
 
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -151,6 +151,7 @@
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
+  initializeSIFixWWMLivenessPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
 }
@@ -762,6 +763,10 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
+  // This must be run after SILowerControlFlow, since it needs to use the
+  // machine-level CFG, but before register allocation.
+  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
 
@@ -775,6 +780,10 @@
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
+  // This must be run after SILowerControlFlow, since it needs to use the
+  // machine-level CFG, but before register allocation.
+  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
 
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -80,6 +80,7 @@
   SIFixControlFlowLiveIntervals.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
+  SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
   SIFrameLowering.cpp
   SIInsertSkips.cpp
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -568,7 +568,8 @@
     default:
       continue;
     case AMDGPU::COPY:
-    case AMDGPU::WQM: {
+    case AMDGPU::WQM:
+    case AMDGPU::WWM: {
       // If the destination register is a physical register there isn't really
       // much we can do to fix this.
       if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
Index: lib/Target/AMDGPU/SIFixWWMLiveness.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -0,0 +1,202 @@
+//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Computations in WWM can overwrite values in inactive channels for
+/// variables that the register allocator thinks are dead. This pass adds fake
+/// uses of those variables to WWM instructions to make sure that they aren't
+/// overwritten.
+///
+/// As an example, consider this snippet:
+/// %vgpr0 = V_MOV_B32_e32 0.0
+/// if (...) {
+///   %vgpr1 = ...
+///   %vgpr2 = WWM %vgpr1
+///   ... = %vgpr2
+///   %vgpr0 = V_MOV_B32_e32 1.0
+/// }
+/// ... = %vgpr0
+///
+/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
+/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
+/// writing %vgpr1 would only write to channels that would be clobbered by the
+/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
+/// it would clobber even the inactive channels for which the if-condition is
+/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
+/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// same register.
+///
+/// In general, we need to figure out which registers might have their inactive
+/// channels (channels that are eventually used) accidentally clobbered by a
+/// WWM instruction. We approximate this using two conditions:
+///
+/// 1. A definition of the variable reaches the WWM instruction.
+/// 2. The variable would be live at the WWM instruction if all its defs were
+///    partial defs (i.e. considered as a use), ignoring normal uses.
+///
+/// If a register matches both conditions, then we add an implicit use of it to
+/// the WWM instruction. Condition #2 is the heart of the matter: every
+/// definition is really a partial definition, since every VALU instruction is
+/// implicitly predicated. We can usually ignore this, but WWM forces us not
+/// to. Condition #1 prevents false positives if the variable is undefined at
+/// the WWM instruction anyways.
+/// This is overly conservative in certain cases,
+/// especially in uniform control flow, but this is a workaround anyways until
+/// LLVM gains the notion of predicated uses and definitions of variables.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-wwm-liveness"
+
+namespace {
+
+class SIFixWWMLiveness : public MachineFunctionPass {
+private:
+  LiveIntervals *LIS = nullptr;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+public:
+  static char ID;
+
+  SIFixWWMLiveness() : MachineFunctionPass(ID) {
+    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool runOnWWMInstruction(MachineInstr &MI);
+
+  void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
+
+  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Should preserve the same set that TwoAddressInstructions does.
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+                "SI fix WWM liveness", false, false)
+
+char SIFixWWMLiveness::ID = 0;
+
+char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
+
+FunctionPass *llvm::createSIFixWWMLivenessPass() {
+  return new SIFixWWMLiveness();
+}
+
+void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
+{
+  for (const MachineOperand &Op : MI.defs()) {
+    if (Op.isReg()) {
+      unsigned Reg = Op.getReg();
+      if (TRI->isVGPR(*MRI, Reg))
+        Regs.set(Reg);
+    }
+  }
+}
+
+bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
+  MachineBasicBlock *MBB = WWM.getParent();
+
+  // Compute the registers that are live out of MI by figuring out which defs
+  // are reachable from MI.
+  SparseBitVector<> LiveOut;
+
+  for (auto II = MachineBasicBlock::iterator(WWM), IE =
+       MBB->end(); II != IE; ++II) {
+    addDefs(*II, LiveOut);
+  }
+
+  for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
+                                        E = df_end(MBB);
+       I != E; ++I) {
+    for (const MachineInstr &MI : **I) {
+      addDefs(MI, LiveOut);
+    }
+  }
+
+  // Compute the registers that reach MI.
+  SparseBitVector<> Reachable;
+
+  for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
+       MBB->rend(); II != IE; ++II) {
+    addDefs(*II, Reachable);
+  }
+
+  for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
+                                         E = idf_end(MBB);
+       I != E; ++I) {
+    for (const MachineInstr &MI : **I) {
+      addDefs(MI, Reachable);
+    }
+  }
+
+  // Find the intersection, and add implicit uses.
+  LiveOut &= Reachable;
+
+  bool Modified = false;
+  for (unsigned Reg : LiveOut) {
+    WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+    if (LIS) {
+      // FIXME: Is there a better way to update the live interval?
+      LIS->removeInterval(Reg);
+      LIS->createAndComputeVirtRegInterval(Reg);
+    }
+    Modified = true;
+  }
+
+  return Modified;
+}
+
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+  bool modified = false;
+
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+        modified |= runOnWWMInstruction(MI);
+      }
+    }
+  }
+
+  return modified;
+}
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3295,6 +3295,11 @@
     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
                    0);
   }
+  case Intrinsic::amdgcn_wwm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+                   0);
+  }
   default:
     return Op;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1145,6 +1145,12 @@
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::EXIT_WWM: {
+    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+    // is exited.
+    MI.setDesc(get(AMDGPU::S_MOV_B64));
+    break;
+  }
   }
   return true;
 }
@@ -2654,6 +2660,7 @@
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::WQM: return AMDGPU::WQM;
+  case AMDGPU::WWM: return AMDGPU::WWM;
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3959,6 +3966,7 @@
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
+  case AMDGPU::WWM:
     if (RI.hasVGPRs(NewDstRC))
       return nullptr;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -116,12 +116,26 @@
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
 
-// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
-// after the WQM pass processes them.
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
 def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$exec), (ins SReg_64:$src0)> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -9,7 +9,7 @@
 //
 /// \file
 /// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
 /// with shader side effects (stores and atomics). This pass is run on the
@@ -29,6 +29,13 @@
 ///   ...
 ///   S_MOV_B64 EXEC, Tmp
 ///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+///   S_OR_SAVEEXEC_B64 Tmp, -1
+///   ...
+///   S_MOV_B64 EXEC, Tmp
+///
 /// In order to avoid excessive switching during sequences of Exact
 /// instructions, the pass first analyzes which instructions must be run in WQM
 /// (aka which instructions produce values that lead to derivative
 /// computations).
@@ -54,6 +61,7 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveInterval.h"
@@ -84,7 +92,8 @@
 enum {
   StateWQM = 0x1,
-  StateExact = 0x2,
+  StateWWM = 0x2,
+  StateExact = 0x4,
 };
 
 struct PrintState {
@@ -97,9 +106,14 @@
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
   if (PS.State & StateWQM)
     OS << "WQM";
-  if (PS.State & StateExact) {
+  if (PS.State & StateWWM) {
     if (PS.State & StateWQM)
       OS << '|';
+    OS << "WWM";
+  }
+  if (PS.State & StateExact) {
+    if (PS.State & (StateWQM | StateWWM))
+      OS << '|';
     OS << "Exact";
   }
 
@@ -108,6 +122,7 @@
 struct InstrInfo {
   char Needs = 0;
+  char Disabled = 0;
   char OutNeeds = 0;
 };
 
@@ -128,6 +143,7 @@
 class SIWholeQuadMode : public MachineFunctionPass {
 private:
+  CallingConv::ID CallingConv;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -142,7 +158,8 @@
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
-  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void markInstructionUses(const MachineInstr &MI, char Flag,
+                           std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -160,6 +177,10 @@
                unsigned SaveWQM, unsigned LiveMaskReg);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SavedWQM);
+  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+             unsigned SaveOrig);
+  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+               unsigned SavedOrig);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
@@ -220,22 +241,26 @@
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
 
-  assert(Flag == StateWQM || Flag == StateExact);
+  assert(!(Flag & StateExact) && Flag != 0);
 
-  // Ignore if the instruction is already marked. The typical case is that we
-  // mark an instruction WQM multiple times, but for atomics it can happen that
-  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
-  // letting the atomic run in StateExact is correct as per the relevant specs.
-  if (II.Needs)
+  // Remove any disabled states from the flag. The user that required it gets
+  // an undefined value in the helper lanes. For example, this can happen if
+  // the result of an atomic is used by an instruction that requires WQM, where
+  // ignoring the request for WQM is correct as per the relevant specs.
+  Flag &= ~II.Disabled;
+
+  // Ignore if the flag is already encompassed by the existing needs, or we
+  // just disabled everything.
+  if ((II.Needs & Flag) == Flag)
     return;
 
-  II.Needs = Flag;
+  II.Needs |= Flag;
   Worklist.push_back(&MI);
 }
 
-/// Mark all instructions defining the uses in \p MI as WQM.
-void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
-                                  std::vector<WorkItem> &Worklist) {
+/// Mark all instructions defining the uses in \p MI with \p Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+                                          std::vector<WorkItem> &Worklist) {
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -260,7 +285,7 @@
       if (Value->isPHIDef())
         continue;
 
-      markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+      markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                       Worklist);
     }
 
@@ -268,7 +293,7 @@
     }
 
     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
+      markInstruction(DefMI, Flag, Worklist);
   }
 }
 
@@ -279,21 +304,28 @@
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
 
-  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
+  // We need to visit the basic blocks in reverse post-order so that we visit
+  // defs before uses, in particular so that we don't accidentally mark an
+  // instruction as needing e.g. WQM before visiting it and realizing it needs
+  // WQM disabled.
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
+    MachineBasicBlock &MBB = **BI;
+    BlockInfo &BBI = Blocks[&MBB];
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
+      InstrInfo &III = Instructions[&MI];
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
         Flags = StateWQM;
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
-        markUsesWQM(MI, Worklist);
+        markInstructionUses(MI, StateWQM, Worklist);
         GlobalFlags |= StateWQM;
         continue;
       } else if (Opcode == AMDGPU::WQM) {
@@ -301,8 +333,23 @@
         // correct, so we need it to be in WQM.
         Flags = StateWQM;
         LowerToCopyInstrs.push_back(&MI);
+      } else if (Opcode == AMDGPU::WWM) {
+        // The WWM intrinsic doesn't make the same guarantee, and it also needs
+        // to be executed in WQM or Exact so that its copy doesn't clobber
+        // inactive lanes.
+        markInstructionUses(MI, StateWWM, Worklist);
+        GlobalFlags |= StateWWM;
+        LowerToCopyInstrs.push_back(&MI);
+        continue;
       } else if (TII->isDisableWQM(MI)) {
-        Flags = StateExact;
+        BBI.Needs |= StateExact;
+        if (!(BBI.InNeeds & StateExact)) {
+          BBI.InNeeds |= StateExact;
+          Worklist.push_back(&MBB);
+        }
+        GlobalFlags |= StateExact;
+        III.Disabled = StateWQM | StateWWM;
+        continue;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
@@ -344,22 +391,24 @@
 
   // Control flow-type instructions and stores to temporary memory that are
   // followed by WQM computations must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !II.Needs &&
+  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
     Instructions[&MI].Needs = StateWQM;
     II.Needs = StateWQM;
   }
 
   // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+  if (II.Needs & StateWQM) {
+    BI.Needs |= StateWQM;
+    if (!(BI.InNeeds & StateWQM)) {
+      BI.InNeeds |= StateWQM;
+      Worklist.push_back(MBB);
+    }
   }
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
+    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
     if (!PrevMI->isPHI()) {
       InstrInfo &PrevII = Instructions[PrevMI];
       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -370,10 +419,10 @@
   }
 
   // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
+  assert(!(II.Needs & StateExact));
 
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  if (II.Needs != 0)
+    markInstructionUses(MI, II.Needs, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -565,6 +614,31 @@
   LIS->InsertMachineInstrInMaps(*MI);
 }
 
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SaveOrig)
+{
+  MachineInstr *MI;
+
+  assert(SaveOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+               SaveOrig)
+           .addImm(-1);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SavedOrig)
+{
+  MachineInstr *MI;
+
+  assert(SavedOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+           .addReg(SavedOrig);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                    bool isEntry) {
   auto BII = Blocks.find(&MBB);
@@ -573,45 +647,66 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
-    return;
-
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
 
   unsigned SavedWQMReg = 0;
+  unsigned SavedNonWWMReg = 0;
   bool WQMFromExec = isEntry;
-  char State = isEntry ? StateExact : StateWQM;
+  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+  char NonWWMState = 0;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   if (isEntry)
     ++II; // Skip the instruction that saves LiveMask
 
-  MachineBasicBlock::iterator First = IE;
+  // This stores the first instruction where it's safe to switch from WQM to
+  // Exact or vice versa.
+  MachineBasicBlock::iterator FirstWQM = IE;
+
+  // This stores the first instruction where it's safe to switch from WWM to
+  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+  // switch to/from WQM as well.
+  MachineBasicBlock::iterator FirstWWM = IE;
   for (;;) {
     MachineBasicBlock::iterator Next = II;
-    char Needs = 0;
+    char Needs = StateExact | StateWQM; // WWM is disabled by default
     char OutNeeds = 0;
 
-    if (First == IE)
-      First = II;
+    if (FirstWQM == IE)
+      FirstWQM = II;
+    if (FirstWWM == IE)
+      FirstWWM = II;
+
+    // First, figure out the allowed states (Needs) based on the propagated
+    // flags.
     if (II != IE) {
       MachineInstr &MI = *II;
 
       if (requiresCorrectState(MI)) {
         auto III = Instructions.find(&MI);
         if (III != Instructions.end()) {
-          Needs = III->second.Needs;
+          if (III->second.Needs & StateWWM)
+            Needs = StateWWM;
+          else if (III->second.Needs & StateWQM)
+            Needs = StateWQM;
+          else
+            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
+      } else {
+        // If the instruction doesn't actually need a correct EXEC, then we can
+        // safely leave WWM enabled.
+        Needs = StateExact | StateWQM | StateWWM;
       }
 
-      if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+      if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;
 
       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
@@ -624,20 +719,45 @@
         Needs = StateWQM;
       else if (BI.OutNeeds == StateExact)
         Needs = StateExact;
+      else
+        Needs = StateWQM | StateExact;
     }
 
-    if (Needs) {
-      if (Needs != State) {
-        MachineBasicBlock::iterator Before =
-            prepareInsertion(MBB, First, II, Needs == StateWQM,
-                             Needs == StateExact || WQMFromExec);
+    // Now, transition if necessary.
+    if (!(Needs & State)) {
+      MachineBasicBlock::iterator First;
+      if (State == StateWWM || Needs == StateWWM) {
+        // We must switch to or from WWM
+        First = FirstWWM;
+      } else {
+        // We only need to switch to/from WQM, so we can use FirstWQM
+        First = FirstWQM;
+      }
+
+      MachineBasicBlock::iterator Before =
+          prepareInsertion(MBB, First, II, Needs == StateWQM,
+                           Needs == StateExact || WQMFromExec);
+
+      if (State == StateWWM) {
+        assert(SavedNonWWMReg);
+        fromWWM(MBB, Before, SavedNonWWMReg);
+        State = NonWWMState;
+      }
 
-        if (Needs == StateExact) {
+      if (Needs == StateWWM) {
+        NonWWMState = State;
+        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        toWWM(MBB, Before, SavedNonWWMReg);
+        State = StateWWM;
+      } else {
+        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM))
            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
-        } else {
+          State = StateExact;
+        } else if (State == StateExact && (Needs & StateWQM) &&
+                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));
 
          toWQM(MBB, Before, SavedWQMReg);
@@ -646,12 +766,19 @@
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
+          State = StateWQM;
+        } else {
+          // We can get here if we transitioned from WWM to a non-WWM state that
+          // already matches our needs, but we shouldn't need to do anything.
+          assert(Needs & State);
         }
-
-        State = Needs;
       }
+    }
 
-      First = IE;
+    if (Needs != (StateExact | StateWQM | StateWWM)) {
+      if (Needs != (StateExact | StateWQM))
+        FirstWQM = IE;
+      FirstWWM = IE;
     }
 
     if (II == IE)
@@ -679,13 +806,11 @@
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
-    return false;
-
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
+  CallingConv = MF.getFunction()->getCallingConv();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -695,14 +820,13 @@
   LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
+  unsigned LiveMaskReg = 0;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(AMDGPU::EXEC);
-    return !LiveMaskQueries.empty();
-  }
-
-  // Store a copy of the original live mask when required
-  unsigned LiveMaskReg = 0;
-  {
+    if (!(GlobalFlags & StateWWM))
+      return !LiveMaskQueries.empty();
+  } else {
+    // Store a copy of the original live mask when required
     MachineBasicBlock &Entry = MF.front();
     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 
@@ -714,13 +838,14 @@
     LIS->InsertMachineInstrInMaps(*MI);
   }
 
+  lowerLiveMaskQueries(LiveMaskReg);
+
   if (GlobalFlags == StateWQM) {
     // For a shader that needs only WQM, we can just set it once.
     BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC)
         .addReg(AMDGPU::EXEC);
 
-    lowerLiveMaskQueries(LiveMaskReg);
     lowerCopyInstrs();
     // EntryMI may become invalid here
     return true;
@@ -729,7 +854,6 @@
 
   DEBUG(printInfo());
 
-  lowerLiveMaskQueries(LiveMaskReg);
   lowerCopyInstrs();
 
   // Handle the general case
Index: test/CodeGen/AMDGPU/fix-wwm-liveness.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fix-wwm-liveness.mir
@@ -0,0 +1,93 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
+#CHECK: %exec = EXIT_WWM killed %19, implicit %21
+
+---
+name:            test_wwm_liveness
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64, preferred-register: '' }
+  - { id: 1, class: sgpr_32, preferred-register: '' }
+  - { id: 2, class: sgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+  - { id: 4, class: vgpr_32, preferred-register: '' }
+  - { id: 5, class: vgpr_32, preferred-register: '' }
+  - { id: 6, class: vgpr_32, preferred-register: '' }
+  - { id: 7, class: vgpr_32, preferred-register: '' }
+  - { id: 8, class: sreg_64, preferred-register: '%vcc' }
+  - { id: 9, class: sreg_64, preferred-register: '' }
+  - { id: 10, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 11, class: sreg_64, preferred-register: '' }
+  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 15, class: sreg_128, preferred-register: '' }
+  - { id: 16, class: vgpr_32, preferred-register: '' }
+  - { id: 17, class: vgpr_32, preferred-register: '' }
+  - { id: 18, class: vgpr_32, preferred-register: '' }
+  - { id: 19, class: sreg_64, preferred-register: '' }
+  - { id: 20, class: sreg_64, preferred-register: '' }
+  - { id: 21, class: vgpr_32, preferred-register: '' }
+  - { id: 22, class: sreg_64, preferred-register: '' }
+  - { id: 23, class: sreg_64, preferred-register: '' }
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    %21 = V_MOV_B32_e32 0, implicit %exec
+    %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
+    %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
+    %8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
+    %22 = COPY %exec, implicit-def %exec
+    %23 = S_AND_B64 %22, %8, implicit-def dead %scc
+    %0 = S_XOR_B64 %23, %22, implicit-def dead %scc
+    %exec = S_MOV_B64_term killed %23
+    SI_MASK_BRANCH %bb.2, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %13 = S_MOV_B32 61440
+    %14 = S_MOV_B32 -1
+    %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
+    %19 = COPY %exec
+    %exec = S_MOV_B64 -1
+    %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
+    %17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
+    %exec = EXIT_WWM killed %19
+    %21 = V_MOV_B32_e32 1, implicit %exec
+    early-clobber %18 = WWM killed %17, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)
+
+  bb.2:
+    %exec = S_OR_B64 %exec, killed %0, implicit-def %scc
+    %vgpr0 = COPY killed %21
+    SI_RETURN_TO_EPILOG killed %vgpr0
+
+...
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -108,6 +108,135 @@
   ret float %out.2
 }
 
+; Check that WWM is triggered by the wwm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_wwm1:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that we don't leave WWM on for computations that don't require WWM,
+; since that will lead to clobbering things that aren't supposed to be
+; clobbered in cases like this.
+;
+;CHECK-LABEL: {{^}}test_wwm2:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
+; write could clobber disabled channels in the non-WWM one.
+;
+;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to WWM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_wwm4(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+;
+;CHECK-LABEL: {{^}}test_wwm5:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+define amdgpu_ps float @test_wwm5() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -530,6 +659,9 @@
 declare void @llvm.AMDGPU.kill(float) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/wqm.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/wqm.mir
@@ -0,0 +1,70 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
+
+---
+# Check for awareness that s_or_saveexec_b64 clobbers SCC
+#
+#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: S_CMP_LT_I32
+#CHECK: S_CSELECT_B32
+name:            test_wwm_scc
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '' }
+  - { id: 1, class: sgpr_32, preferred-register: '' }
+  - { id: 2, class: sgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+  - { id: 4, class: vgpr_32, preferred-register: '' }
+  - { id: 5, class: sgpr_32, preferred-register: '' }
+  - { id: 6, class: vgpr_32, preferred-register: '' }
+  - { id: 7, class: vgpr_32, preferred-register: '' }
+  - { id: 8, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 9, class: sreg_32, preferred-register: '' }
+  - { id: 10, class: sreg_32, preferred-register: '' }
+  - { id: 11, class: vgpr_32, preferred-register: '' }
+  - { id: 12, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '%sgpr0', virtual-reg: '%0' }
+  - { reg: '%sgpr1', virtual-reg: '%1' }
+  - { reg: '%sgpr2', virtual-reg: '%2' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0:
+    liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0
+
+    %3 = COPY %vgpr0
+    %2 = COPY %sgpr2
+    %1 = COPY %sgpr1
+    %0 = COPY %sgpr0
+    S_CMP_LT_I32 0, %0, implicit-def %scc
+    %12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
+    %5 = S_CSELECT_B32 %2, %1, implicit %scc
+    %11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
+    %vgpr0 = WWM %11, implicit %exec
+    SI_RETURN_TO_EPILOG %vgpr0
+
+...
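
For reference, a minimal IR-level sketch (not part of the patch, names illustrative only) of how @llvm.amdgcn.wwm is meant to be used, modeled on the test_wwm1 case above: the operations feeding the intrinsic are the ones the pass runs with all channels enabled, and only the active channels of the result are copied out.

define amdgpu_ps float @wwm_sum_sketch(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  ; The loads and the fadd feed @llvm.amdgcn.wwm, so SIWholeQuadMode computes
  ; them in Whole Wavefront Mode before copying the active channels to %sum.wwm.
  %a = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %b = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %sum = fadd float %a, %b
  %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)
  ret float %sum.wwm
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
declare float @llvm.amdgcn.wwm.f32(float)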