// Patch summary (text before the "Index:" header is commentary only).
//
// Target file: lib/Target/AMDGPU/SIWholeQuadMode.cpp
//
// What the hunks below do:
//  * Add a StateWWM flag (0x2) and move StateExact from 0x2 to 0x4; the
//    PrintState operator<< prints "WWM" and separates set flags with '|'.
//  * Add a `callingConv` member, assigned in runOnMachineFunction from
//    MF.getFunction()->getCallingConv(). The early return for functions whose
//    calling convention is not AMDGPU_PS is removed; instead, DS opcodes are
//    flagged StateWQM only when callingConv == CallingConv::AMDGPU_PS.
//  * Rename markUsesWQM to markInstructionUses and give it a `Flag` parameter
//    that is forwarded to markInstruction in place of the fixed StateWQM.
//  * For DPP opcodes, read the immediate of operand 1 and map 0 -> StateExact,
//    1 -> StateWQM, 2 -> StateWWM (any other value trips the assert).
//  * In the hunk at -345, mask StateWWM out of the needs merged into
//    BI.InNeeds and out of the InNeeds handed to the previous instruction,
//    and call markInstructionUses(MI, II.Needs, ...) whenever II.Needs is
//    neither 0 nor StateExact.
//  * Add toWWM (COPY of EXEC into SaveOrig, then S_MOV_B64 EXEC, -1) and
//    fromWWM (COPY of SavedOrig back into EXEC); each new instruction is
//    registered via LIS->InsertMachineInstrInMaps.
//  * In processBlock, drop the early return on !(BI.InNeeds & StateWQM),
//    change the non-entry skip test to BI.Needs == StateWQM, leave WWM
//    (restoring NonWWMState) before any other transition, and enter WWM by
//    saving State and creating a fresh SReg_64 virtual register.
//  * In runOnMachineFunction, test GlobalFlags against (StateWQM | StateWWM).
//
// NOTE(review): this text appears to have lost its line breaks and the
// angle-bracket template arguments (e.g. `std::vector &Worklist`,
// `getAnalysis()`), so presumably it will not apply as-is -- confirm against
// the original patch before using it.
// NOTE(review): markInstructionUses(MI, II.Needs, ...) forwards II.Needs as
// `Flag`, while the assert in the hunk at -218 requires Flag to be exactly one
// of StateWQM / StateWWM / StateExact. If II.Needs can ever hold
// StateWQM | StateWWM that assert would fire -- TODO confirm.
// NOTE(review): `callingConv` and `wqmType` start with a lowercase letter and
// toWWM/fromWWM put the opening brace on its own line, unlike the neighbouring
// names (TII, MRI, Opcode, Flags) and definitions visible here -- style nit.
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -84,7 +84,8 @@ enum { StateWQM = 0x1, - StateExact = 0x2, + StateWWM = 0x2, + StateExact = 0x4, }; struct PrintState { @@ -97,9 +98,14 @@ static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { if (PS.State & StateWQM) OS << "WQM"; - if (PS.State & StateExact) { + if (PS.State & StateWWM) { if (PS.State & StateWQM) OS << '|'; + OS << "WWM"; + } + if (PS.State & StateExact) { + if (PS.State & (StateWQM | StateWWM)) + OS << '|'; OS << "Exact"; } @@ -128,6 +134,7 @@ class SIWholeQuadMode : public MachineFunctionPass { private: + CallingConv::ID callingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; @@ -141,7 +148,8 @@ void markInstruction(MachineInstr &MI, char Flag, std::vector &Worklist); - void markUsesWQM(const MachineInstr &MI, std::vector &Worklist); + void markInstructionUses(const MachineInstr &MI, char Flag, + std::vector &Worklist); char scanInstructions(MachineFunction &MF, std::vector &Worklist); void propagateInstruction(MachineInstr &MI, std::vector &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); @@ -159,6 +167,10 @@ unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM); + void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveOrig); + void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedOrig); void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); void lowerLiveMaskQueries(unsigned LiveMaskReg); @@ -218,7 +230,7 @@ std::vector &Worklist) { InstrInfo &II = Instructions[&MI]; - assert(Flag == StateWQM || Flag == StateExact); + assert(Flag == StateWQM || Flag == StateWWM || Flag == StateExact); // Ignore if the 
instruction is already marked. The typical case is that we // mark an instruction WQM multiple times, but for atomics it can happen that @@ -231,9 +243,9 @@ Worklist.push_back(&MI); } -/// Mark all instructions defining the uses in \p MI as WQM. -void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, - std::vector &Worklist) { +/// Mark all instructions defining the uses in \p MI with the given Flag. +void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, + std::vector &Worklist) { for (const MachineOperand &Use : MI.uses()) { if (!Use.isReg() || !Use.isUse()) continue; @@ -258,7 +270,7 @@ if (Value->isPHIDef()) continue; - markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, Worklist); } @@ -266,7 +278,7 @@ } for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, StateWQM, Worklist); + markInstruction(DefMI, Flag, Worklist); } } @@ -285,13 +297,23 @@ unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isDS(Opcode)) { + if (TII->isDS(Opcode) && callingConv == CallingConv::AMDGPU_PS) { Flags = StateWQM; + } else if (TII->isDPP(Opcode)) { + unsigned wqmType = MI.getOperand(1).getImm(); + if (wqmType == 0) { + Flags = StateExact; + } else if (wqmType == 1) { + Flags = StateWQM; + } else { + assert(wqmType == 2); + Flags = StateWWM; + } } else if (TII->isWQM(Opcode)) { // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. 
- markUsesWQM(MI, Worklist); + markInstructionUses(MI, StateWQM, Worklist); GlobalFlags |= StateWQM; continue; } else if (TII->isDisableWQM(MI)) { @@ -345,14 +367,14 @@ // Propagate to block level BI.Needs |= II.Needs; - if ((BI.InNeeds | II.Needs) != BI.InNeeds) { - BI.InNeeds |= II.Needs; + if ((BI.InNeeds | (II.Needs & ~StateWWM)) != BI.InNeeds) { + BI.InNeeds |= (II.Needs & ~StateWWM); Worklist.push_back(MBB); } // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = II.Needs | II.OutNeeds; + char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -362,11 +384,11 @@ } } - // Propagate WQM flag to instruction inputs + // Propagate WQM and WWM flags to instruction inputs assert(II.Needs != (StateWQM | StateExact)); - if (II.Needs == StateWQM) - markUsesWQM(MI, Worklist); + if (II.Needs != 0 && II.Needs != StateExact) + markInstructionUses(MI, II.Needs, Worklist); } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -558,6 +580,33 @@ LIS->InsertMachineInstrInMaps(*MI); } +void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveOrig) +{ + MachineInstr *MI; + + assert(SaveOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveOrig) + .addReg(AMDGPU::EXEC); + LIS->InsertMachineInstrInMaps(*MI); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(-1); + LIS->InsertMachineInstrInMaps(*MI); +} + +void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedOrig) +{ + MachineInstr *MI; + + assert(SavedOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedOrig); + LIS->InsertMachineInstrInMaps(*MI); +} + void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry) { 
auto BII = Blocks.find(&MBB); @@ -566,19 +615,18 @@ const BlockInfo &BI = BII->second; - if (!(BI.InNeeds & StateWQM)) - return; - // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) return; DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n"); unsigned SavedWQMReg = 0; + unsigned SavedNonWWMReg = 0; bool WQMFromExec = isEntry; char State = isEntry ? StateExact : StateWQM; + char NonWWMState = 0; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (isEntry) @@ -625,7 +673,16 @@ prepareInsertion(MBB, First, II, Needs == StateWQM, Needs == StateExact || WQMFromExec); - if (Needs == StateExact) { + if (State == StateWWM) { + State = NonWWMState; + fromWWM(MBB, Before, SavedNonWWMReg); + } + + if (Needs == StateWWM) { + NonWWMState = State; + SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + toWWM(MBB, Before, SavedNonWWMReg); + } else if (Needs == StateExact) { if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -667,12 +724,10 @@ } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) - return false; - Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); + callingConv = MF.getFunction()->getCallingConv(); const SISubtarget &ST = MF.getSubtarget(); @@ -682,7 +737,7 @@ LIS = &getAnalysis(); char GlobalFlags = analyzeFunction(MF); - if (!(GlobalFlags & StateWQM)) { + if (!(GlobalFlags & (StateWQM | StateWWM))) { lowerLiveMaskQueries(AMDGPU::EXEC); return !LiveMaskQueries.empty(); }