Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "GCNSchedStrategy.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
 #include "R600MachineScheduler.h"
@@ -95,6 +96,13 @@
   return new SIScheduleDAGMI(C);
 }
 
+static ScheduleDAGInstrs *createGCNMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+    new ScheduleDAGMILive(C, make_unique<GCNSchedStrategy>(C));
+  DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
 static MachineSchedRegistry
 R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);
@@ -103,6 +111,10 @@
 SISchedRegistry("si", "Run SI's custom scheduler",
                 createSIMachineScheduler);
 
+static MachineSchedRegistry
+GCNSchedRegistry("gcn", "Run GCN custom scheduler",
+                 createGCNMachineScheduler);
+
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -49,6 +49,7 @@
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
   GCNHazardRecognizer.cpp
+  GCNSchedStrategy.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
Index: lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -0,0 +1,51 @@
+//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+
+class GCNSchedStrategy : public GenericScheduler {
+
+  SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+  void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+                         const RegPressureTracker &RPTracker,
+                         SchedCandidate &Cand);
+
+  void initCandidate(SchedCandidate &Cand, SUnit *SU,
+                     bool AtTop, const RegPressureTracker &RPTracker,
+                     RegPressureTracker &TempTracker,
+                     const SIRegisterInfo *SRI,
+                     int SGPRPressure, int VGPRPressure,
+                     int SGPRExcessLimit, int VGPRExcessLimit,
+                     int SGPRCriticalLimit, int VGPRCriticalLimit);
+
+  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone, const SIRegisterInfo *SRI,
+                    unsigned SGPRPressure, unsigned VGPRPressure);
+
+public:
+  GCNSchedStrategy(const MachineSchedContext *C);
+
+  SUnit *pickNode(bool &IsTopNode) override;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,350 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This is a minimal scheduler strategy. The main difference between this
+/// and the GenericScheduler is that GCNSchedStrategy uses different
+/// heuristics to determine excess/critical pressure sets.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) :
+    GenericScheduler(C) { }
+
+static unsigned MaxWavesPerSGPRs(unsigned SGPRs,
+                                 const SISubtarget &ST) {
+
+  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+    if (SGPRs <= 80)
+      return 10;
+    if (SGPRs <= 88)
+      return 9;
+    if (SGPRs <= 100)
+      return 8;
+    return 7;
+  }
+  if (SGPRs <= 48)
+    return 10;
+  if (SGPRs <= 56)
+    return 9;
+  if (SGPRs <= 64)
+    return 8;
+  if (SGPRs <= 72)
+    return 7;
+  if (SGPRs <= 80)
+    return 6;
+  return 5;
+}
+
+static unsigned MaxWavesPerVGPRs(unsigned VGPRs) {
+  if (VGPRs <= 24)
+    return 10;
+  if (VGPRs <= 28)
+    return 9;
+  if (VGPRs <= 32)
+    return 8;
+  if (VGPRs <= 36)
+    return 7;
+  if (VGPRs <= 40)
+    return 6;
+  if (VGPRs <= 48)
+    return 5;
+  if (VGPRs <= 64)
+    return 4;
+  if (VGPRs <= 84)
+    return 3;
+  if (VGPRs <= 128)
+    return 2;
+  return 1;
+}
+
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+                            const SISubtarget &ST) {
+  return std::min(MaxWavesPerSGPRs(SGPRs, ST), MaxWavesPerVGPRs(VGPRs));
+}
+
+void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop, const RegPressureTracker &RPTracker,
+                                     RegPressureTracker &TempTracker,
+                                     const SIRegisterInfo *SRI,
+                                     int SGPRPressure,
+                                     int VGPRPressure,
+                                     int SGPRExcessLimit,
+                                     int VGPRExcessLimit,
+                                     int SGPRCriticalLimit,
+                                     int VGPRCriticalLimit) {
+
+  const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+  Cand.SU = SU;
+  Cand.AtTop = AtTop;
+
+  // If two instructions increase the pressure of different register sets
+  // by the same amount, the generic scheduler will prefer to schedule the
+  // instruction that increases the set with the least amount of registers,
+  // which in our case would be SGPRs. This is rarely what we want, so
+  // when we report excess/critical register pressure, we do it either
+  // only for VGPRs or only for SGPRs.
+
+  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+  const int MaxVGPRPressureInc = 16;
+  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+  std::vector<unsigned> Pressure;
+  std::vector<unsigned> MaxPressure;
+
+  if (AtTop)
+    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  else
+    // FIXME: I think for bottom up scheduling, the register pressure is cached
+    // and can be retrieved by DAG->getPressureDiff(SU).
+    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+
+  int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+
+  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+  // to increase the likelihood we don't go over the limits. We should improve
+  // the analysis to look through dependencies to find the path with the least
+  // register pressure.
+  // FIXME: This is also necessary, because some passes that run after
+  // scheduling and before regalloc increase register pressure.
+  const int ErrorMargin = 3;
+  VGPRExcessLimit -= ErrorMargin;
+  SGPRExcessLimit -= ErrorMargin;
+
+  // We only need to update the RPDelta for instructions that increase
+  // register pressure. Instructions that decrease or keep reg pressure
+  // the same will be marked as RegExcess in tryCandidate() when they
+  // are compared with instructions that increase the register pressure.
+  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+  }
+
+  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
+  }
+
+  // Register pressure is considered 'CRITICAL' if it is approaching a value
+  // that would reduce the wave occupancy for the execution unit. When
+  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
+  // has the same cost, so we don't need to prefer one over the other.
+
+  unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, ST);
+  VGPRCriticalLimit = SRI->getNumVGPRsAllowed(MaxWaves);
+  SGPRCriticalLimit = SRI->getNumSGPRsAllowed(DAG->MF.getSubtarget<SISubtarget>(),
+                                              MaxWaves);
+
+  VGPRCriticalLimit -= ErrorMargin;
+  SGPRCriticalLimit -= ErrorMargin;
+
+  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+    if (SGPRDelta > VGPRDelta) {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+    } else {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+    }
+  }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeFromQueue()
+void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+                                         const CandPolicy &ZonePolicy,
+                                         const RegPressureTracker &RPTracker,
+                                         SchedCandidate &Cand) {
+  // initCandidate() will make changes to the RPTracker, so we need to pass it
+  // a non-const tracker. However, initCandidate() will undo any changes it
+  // makes, so TempTracker will be the same before and after the call to
+  // initCandidate().
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  RegisterPressure Pressure = TempTracker.getPressure();
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  std::pair<unsigned, unsigned> CurPressure =
+      SRI->getMaxRegPressure(TempTracker.getRegSetPressureAtPos());
+
+  std::pair<unsigned, unsigned> ExcessLimits = SRI->getRegPressureSetLimits(DAG->MF);
+
+  ReadyQueue &Q = Zone.Available;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker, SRI,
+                  CurPressure.first, CurPressure.second, ExcessLimits.first,
+                  ExcessLimits.second, 0, 0);
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+    //tryCandidate(Cand, TryCand, ZoneArg, SRI, MaxPressure.first, MaxPressure.second);
+    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      Cand.setBest(TryCand);
+    }
+  }
+}
+
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+  switch (Reason) {
+  default:
+    return Reason;
+  case GenericSchedulerBase::RegCritical:
+  case GenericSchedulerBase::RegExcess:
+    return -1 * Reason;
+  }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) { + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + IsTopNode = true; + return SU; + } + // Set the bottom-up policy based on the state of the current bottom zone and + // the instructions outside the zone, including the top zone. + CandPolicy BotPolicy; + setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); + // Set the top-down policy based on the state of the current top zone and + // the instructions outside the zone, including the bottom zone. + CandPolicy TopPolicy; + setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + + // See if BotCand is still valid (because we previously scheduled from Top). + DEBUG(dbgs() << "Picking from Bot:\n"); + if (!BotCand.isValid() || BotCand.SU->isScheduled || + BotCand.Policy != BotPolicy) { + BotCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(BotCand)); + } + + // Check if the top Q has a better candidate. + DEBUG(dbgs() << "Picking from Top:\n"); + if (!TopCand.isValid() || TopCand.SU->isScheduled || + TopCand.Policy != TopPolicy) { + TopCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(TopCand)); + } + + // Pick best from BotCand and TopCand. 
+  DEBUG(dbgs() << "Top Cand: ");
+  DEBUG(traceCandidate(TopCand));
+  DEBUG(dbgs() << "Bot Cand: ");
+  DEBUG(traceCandidate(BotCand));
+  SchedCandidate Cand;
+  if (TopCand.Reason == BotCand.Reason) {
+    Cand = BotCand;
+    GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
+    TopCand.Reason = NoCand;
+    GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+    if (TopCand.Reason != NoCand) {
+      Cand.setBest(TopCand);
+    } else {
+      TopCand.Reason = TopReason;
+    }
+  } else {
+    if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else {
+      int TopRank = getBidirectionalReasonRank(TopCand.Reason);
+      int BotRank = getBidirectionalReasonRank(BotCand.Reason);
+      if (TopRank > BotRank) {
+        Cand = TopCand;
+      } else {
+        Cand = BotCand;
+      }
+    }
+  }
+  DEBUG(dbgs() << "Picking: ");
+  DEBUG(traceCandidate(Cand));
+
+  IsTopNode = Cand.AtTop;
+  return Cand.SU;
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNode()
+SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+  SUnit *SU;
+  do {
+    if (RegionPolicy.OnlyTopDown) {
+      SU = Top.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        TopCand.reset(NoPolicy);
+        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+        assert(TopCand.Reason != NoCand && "failed to find a candidate");
+        SU = TopCand.SU;
+      }
+      IsTopNode = true;
+    } else if (RegionPolicy.OnlyBottomUp) {
+      SU = Bot.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        BotCand.reset(NoPolicy);
+        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+        assert(BotCand.Reason != NoCand && "failed to find a candidate");
+        SU = BotCand.SU;
+      }
+      IsTopNode = false;
+    } else {
+      SU = pickNodeBidirectional(IsTopNode);
+    }
+  } while (SU->isScheduled);
+
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+  return SU;
+}
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -48,10 +48,6 @@
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                  unsigned Idx) const override;
-
-
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -230,27 +230,6 @@
   return Reserved;
 }
 
-unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
-                                                unsigned Idx) const {
-  const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
-  // FIXME: We should adjust the max number of waves based on LDS size.
-  unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
-  unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
-
-  unsigned VSLimit = SGPRLimit + VGPRLimit;
-
-  if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
-    // FIXME: This is a hack. We should never be considering the pressure of
-    // these since no virtual register should ever have this class.
-    return VSLimit;
-  }
-
-  if (SGPRPressureSets.test(Idx))
-    return SGPRLimit;
-
-  return VGPRLimit;
-}
-
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
   return Fn.getFrameInfo().hasStackObjects();
 }
Index: lib/Target/AMDGPU/SISchedule.td
===================================================================
--- lib/Target/AMDGPU/SISchedule.td
+++ lib/Target/AMDGPU/SISchedule.td
@@ -47,6 +47,10 @@
 
 class SISchedMachineModel : SchedMachineModel {
   let CompleteModel = 0;
+  // MicroOpBufferSize = 1 means that instructions will always be added to
+  // the ready queue when they become available. This exposes them
+  // to the register pressure analysis.
+  let MicroOpBufferSize = 1;
   let IssueWidth = 1;
   let PostRAScheduler = 1;
 }