Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -40,6 +40,11 @@
 #undef AMDGPUSubtarget
 #include "R600GenSubtargetInfo.inc"
 
+static cl::opt<bool> DisablePowerSched(
+  "amdgpu-disable-power-sched",
+  cl::desc("Disable scheduling to minimize mAI power bursts"),
+  cl::init(false));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 R600Subtarget &
@@ -751,11 +756,130 @@
     }
   }
 };
+
+struct FillMFMAShadowMutation : ScheduleDAGMutation {
+  const SIInstrInfo *TII;
+
+  ScheduleDAGMI *DAG;
+
+  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+  bool isSALU(const SUnit *SU) const {
+    const MachineInstr &MI = *SU->getInstr();
+    return TII->isSALU(MI) && !MI.isTerminator();
+  }
+
+  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
+    if (Pred->NodeNum < Succ->NodeNum)
+      return true;
+
+    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
+
+    for (unsigned I = 0; I < Succs.size(); ++I) {
+      for (const SDep &SI : Succs[I]->Succs) {
+        const SUnit *SU = SI.getSUnit();
+        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
+          Succs.push_back(SU);
+      }
+    }
+
+    SmallPtrSet<const SUnit*, 32> Visited;
+    while (!Preds.empty()) {
+      const SUnit *SU = Preds.pop_back_val();
+      if (llvm::find(Succs, SU) != Succs.end())
+        return false;
+      Visited.insert(SU);
+      for (const SDep &SI : SU->Preds)
+        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
+          Preds.push_back(SI.getSUnit());
+    }
+
+    return true;
+  }
+
+  // Link as many SALU instructions in a chain as possible. Return the size
+  // of the chain. Links up to MaxChain instructions.
+  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
+                         SmallPtrSetImpl<SUnit *> &Visited) const {
+    SmallVector<SUnit *, 8> Worklist({To});
+    unsigned Linked = 0;
+
+    while (!Worklist.empty() && MaxChain-- > 0) {
+      SUnit *SU = Worklist.pop_back_val();
+      if (!Visited.insert(SU).second)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
+                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
+
+      if (SU->addPred(SDep(From, SDep::Artificial), false))
+        ++Linked;
+
+      for (SDep &SI : From->Succs) {
+        SUnit *SUv = SI.getSUnit();
+        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
+          SUv->addPred(SDep(SU, SDep::Artificial), false);
+      }
+
+      for (SDep &SI : SU->Succs) {
+        SUnit *Succ = SI.getSUnit();
+        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
+          Worklist.push_back(Succ);
+      }
+    }
+
+    return Linked;
+  }
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasMAIInsts() || DisablePowerSched)
+      return;
+    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+    if (!TSchedModel || DAG->SUnits.empty())
+      return;
+
+    // Scan for long latency MFMA instructions and make available SALU
+    // instructions depend on them so that they have a chance to fill the
+    // MFMA shadow. Filling the shadow with SALU rather than VALU is
+    // desirable to prevent power consumption bursts and throttling.
+    auto LastSALU = DAG->SUnits.begin();
+    auto E = DAG->SUnits.end();
+    SmallPtrSet<SUnit*, 32> Visited;
+    for (SUnit &SU : DAG->SUnits) {
+      MachineInstr &MAI = *SU.getInstr();
+      if (!TII->isMAI(MAI) ||
+          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
+          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
+        continue;
+
+      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
+
+      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
+                 dbgs() << "Need " << Lat
+                        << " instructions to cover latency.\n");
+
+      // Find up to Lat independent scalar instructions as early as
+      // possible such that they can be scheduled after this MFMA.
+      for ( ; Lat && LastSALU != E; ++LastSALU) {
+        if (Visited.count(&*LastSALU))
+          continue;
+
+        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
+          continue;
+
+        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
+      }
+    }
+  }
+};
 } // namespace
 
 void GCNSubtarget::getPostRAMutations(
     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
+  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 }
 
 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
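
To make the numbers concrete: with the scheduling model changes in SISchedule.td below, a 32x32 MFMA maps to Write16PassMAI (latency 16), so the mutation computes Lat = 16 - 1 = 15 and tries to link up to 15 independent, not-yet-visited SALU instructions behind that MFMA via artificial edges, while ordering the MFMA's VALU successors after those SALU nodes so the shadow is not filled with VALU work. If the heuristic hurts a particular schedule, the whole mutation can be switched off with the new -amdgpu-disable-power-sched option.
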
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -660,6 +660,14 @@
     return !RI.isSGPRReg(MRI, Dest);
   }
 
+  bool hasVGPRUses(const MachineInstr &MI) const {
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    return llvm::any_of(MI.explicit_uses(),
+                        [&MRI, this](const MachineOperand &MO) {
+      return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
+  }
+
   /// Whether we must prevent this instruction from executing with EXEC = 0.
   bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
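
The new hasVGPRUses helper backs the PredMIReadVGPR scheduling predicate added to SISchedule.td below. Presumably the point is that V_ACCVGPR_WRITE_B32 should only pay the extra register-setup cost modeled by the MIVGPRRead read-advance when it actually reads a VGPR; when none of its explicit uses is a VGPR (for example, an immediate source), the SchedReadVariant falls through to ReadDefault.
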
Index: llvm/trunk/lib/Target/AMDGPU/SISchedule.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SISchedule.td
+++ llvm/trunk/lib/Target/AMDGPU/SISchedule.td
@@ -24,6 +24,9 @@
 def WriteVMEM         : SchedWrite;
 def WriteBarrier      : SchedWrite;
 
+def MIVGPRRead        : SchedRead;
+def MIMFMARead        : SchedRead;
+
 // Vector ALU instructions
 def Write32Bit         : SchedWrite;
 def WriteQuarterRate32 : SchedWrite;
@@ -43,6 +46,11 @@
 // Half rate 64-bit instructions.
 def Write64Bit : SchedWrite;
 
+// mAI multipass instructions.
+def Write2PassMAI  : SchedWrite;
+def Write8PassMAI  : SchedWrite;
+def Write16PassMAI : SchedWrite;
+
 // FIXME: Should there be a class for instructions which are VALU
 // instructions and have VALU rates, but write to the SALU (i.e. VOPC
 // instructions)
@@ -97,6 +105,11 @@
 class HWVALUWriteRes<SchedWrite write, int latency> :
   HWWriteRes<write, [HWVALU], latency>;
 
+def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;
+
+def MIReadVGPR : SchedReadVariant<[
+      SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
+      SchedVar<NoSchedPred, [ReadDefault]>]>;
 
 // The latency numbers are taken from AMD Accelerated Parallel Processing
 // guide. They may not be accurate.
@@ -115,6 +128,24 @@
   def : HWVALUWriteRes<Write32Bit,         1>;
   def : HWVALUWriteRes<Write64Bit,         2>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+  def : HWVALUWriteRes<Write2PassMAI,      2>;
+  def : HWVALUWriteRes<Write8PassMAI,      8>;
+  def : HWVALUWriteRes<Write16PassMAI,     16>;
+
+  def : ReadAdvance<MIVGPRRead, -2>;
+  def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
+
+  // Technically mfma reads can be from 0 to 4 cycles but that does not make
+  // sense to model because its register setup is huge. In particular if we
+  // properly model read advance as -2 for a vgpr read it will result in a
+  // bad scheduling of acc writes before that mfma. To avoid it we would
+  // need to consume 2 or 4 more vgprs to be initialized before the acc
+  // write sequence. Just assume worst case here.
+  def : ReadAdvance<MIMFMARead, -4>;
+
+  def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
+  def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
+  def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
 }
 
 def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
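
One detail that is easy to misread in the ReadAdvance entries: a negative advance lengthens a dependency rather than shortening it, because the scheduler derives the operand latency roughly as WriteLatency - ReadAdvanceCycles. A small standalone sketch of that arithmetic (illustrative only; the helper name and the clamp to zero are mine, this is not LLVM's code):

  // Sketch of how a write's latency combines with a possibly negative
  // ReadAdvance. A positive advance means the operand is read late, so less
  // latency is exposed; a negative advance adds cycles on top of the latency.
  #include <algorithm>
  #include <cstdio>

  static int effectiveLatency(int WriteLatency, int ReadAdvanceCycles) {
    return std::max(WriteLatency - ReadAdvanceCycles, 0);
  }

  int main() {
    // Write16PassMAI (latency 16) feeding an MFMA accumulator read (advance -4).
    std::printf("MFMA -> MFMA: %d\n", effectiveLatency(16, -4));         // 20
    // V_ACCVGPR_WRITE_B32 (Write64Bit, latency 2) feeding an MFMA (advance -4).
    std::printf("accvgpr_write -> MFMA: %d\n", effectiveLatency(2, -4)); // 6
    return 0;
  }

With the values above, an MFMA consuming another MFMA's accumulator is therefore modeled at 16 + 4 = 20 cycles, and a V_ACCVGPR_WRITE_B32 feeding an MFMA at 2 + 4 = 6 cycles, which is the worst case the comment above chooses to assume.
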