Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "SIISelLowering.h" #include "SIFrameLowering.h" +#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" @@ -317,6 +318,11 @@ /// the given LDS memory size is the only constraint. unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo(); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction()); + } + bool hasFP16Denormals() const { return FP64FP16Denormals; } Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -23,6 +23,7 @@ #endif #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" +#include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" #include "SIMachineScheduler.h" @@ -142,6 +143,20 @@ return DAG; } +static ScheduleDAGInstrs * +createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; +} + +static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { + return new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_MINREGFORCED); +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -155,6 +170,16 @@ "Run GCN 
scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler); +static MachineSchedRegistry +IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", + "Run GCN scheduler to maximize occupancy (experimental)", + createIterativeGCNMaxOccupancyMachineScheduler); + +static MachineSchedRegistry +GCNMinRegSchedRegistry("gcn-minreg", + "Run GCN iterative scheduler for minimal registry usage (experimental)", + createMinRegScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -92,6 +92,9 @@ SIShrinkInstructions.cpp SITypeRewriter.cpp SIWholeQuadMode.cpp + GCNIterativeScheduler.cpp + GCNMinRegStrategy.cpp + GCNRegPressure.cpp ${GLOBAL_ISEL_BUILD_FILES} ) Index: lib/Target/AMDGPU/GCNIterativeScheduler.h =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -0,0 +1,120 @@ +//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H + +#include "GCNRegPressure.h" + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class GCNIterativeScheduler : public ScheduleDAGMILive { + typedef ScheduleDAGMILive BaseClass; +public: + enum StrategyKind { + SCHEDULE_MINREGONLY, + SCHEDULE_MINREGFORCED, + SCHEDULE_LEGACYMAXOCCUPANCY + }; + + GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S); + + void schedule() override; + + void enterRegion(MachineBasicBlock *BB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned RegionInstrs) override; + + void finalizeSchedule() override; + +protected: + + typedef ArrayRef ScheduleRef; + + struct TentativeSchedule { + std::vector Schedule; + GCNRegPressure MaxPressure; + }; + + struct Region { + // Fields except for BestSchedule are supposed to reflect current IR state + // `const` fields are to emphasize they shouldn't change for any schedule. 
+ MachineBasicBlock::iterator Begin; + // End is either a boundary instruction or end of basic block + const MachineBasicBlock::iterator End; + const unsigned NumRegionInstrs; + GCNRegPressure MaxPressure; + + // best schedule for the region so far (not scheduled yet) + std::unique_ptr BestSchedule; + }; + + SpecificBumpPtrAllocator Alloc; + std::vector Regions; + + MachineSchedContext *Context; + const StrategyKind Strategy; + mutable GCNUpwardRPTracker UPTracker; + + class BuildDAG; + class OverrideLegacyStrategy; + + template + GCNRegPressure getSchedulePressure(const Region &R, + Range &&Schedule) const; + + GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) const; + + GCNRegPressure getRegionPressure(const Region &R) const { + return getRegionPressure(R.Begin, R.End); + } + + void setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + void scheduleBest(Region &R); + + std::vector detachSchedule(ScheduleRef Schedule) const; + + void sortRegionsByPressure(unsigned TargetOcc); + + template + void scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + unsigned tryMaximizeOccupancy(unsigned TargetOcc = + std::numeric_limits::max()); + + void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true); + void scheduleMinReg(bool force = false); + +#ifndef NDEBUG + void printRegions(raw_ostream &OS) const; + void printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const; + void printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const; +#endif +}; + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H Index: lib/Target/AMDGPU/GCNIterativeScheduler.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -0,0 +1,526 @@ 
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNIterativeScheduler.h" +#include "GCNSchedStrategy.h" +#include "SIMachineFunctionInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +namespace llvm { + std::vector makeMinRegSchedule(ArrayRef TopRoots, + const ScheduleDAG &DAG); +} + +// shim accessors for different order containers +static inline MachineInstr *getMachineInstr(MachineInstr *MI) { + return MI; +} +static inline MachineInstr *getMachineInstr(const SUnit *SU) { + return SU->getInstr(); +} +static inline MachineInstr *getMachineInstr(const SUnit &SU) { + return SU.getInstr(); +} + +#ifndef NDEBUG +static void printRegion(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const LiveIntervals *LIS, + unsigned MaxInstNum = + std::numeric_limits::max()) { + auto BB = Begin->getParent(); + OS << BB->getParent()->getName() << ":BB#" << BB->getNumber() + << ' ' << BB->getName() << ":\n"; + auto I = Begin; + MaxInstNum = std::max(MaxInstNum, 1u); + for (; I != End && MaxInstNum; ++I, --MaxInstNum) { + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (I != End) { + OS << "\t...\n"; + I = std::prev(End); + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (End != BB->end()) { // print boundary inst if present + OS << "----\n"; + if (LIS) OS << LIS->getInstructionIndex(*End) << '\t'; + OS << *End; + } +} + +static void printLivenessInfo(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + 
const LiveIntervals *LIS) { + const auto BB = Begin->getParent(); + const auto &MRI = BB->getParent()->getRegInfo(); + + const auto LiveIns = getLiveRegsBefore(*Begin, *LIS); + OS << "LIn RP: "; + getRegPressure(MRI, LiveIns).print(OS); + + const auto BottomMI = End == BB->end() ? std::prev(End) : End; + const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS); + OS << "LOt RP: "; + getRegPressure(MRI, LiveOuts).print(OS); +} + +void GCNIterativeScheduler::printRegions(raw_ostream &OS) const { + const auto &ST = MF.getSubtarget(); + for (const auto R : Regions) { + OS << "Region to schedule "; + printRegion(OS, R->Begin, R->End, LIS, 1); + printLivenessInfo(OS, R->Begin, R->End, LIS); + OS << "Max RP: "; + R->MaxPressure.print(OS, &ST); + } +} + +void GCNIterativeScheduler::printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const { + OS << "\nAfter scheduling "; + printRegion(OS, R->Begin, R->End, LIS); + printSchedRP(OS, R->MaxPressure, RP); + OS << '\n'; +} + +void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const { + const auto &ST = MF.getSubtarget(); + OS << "RP before: "; + Before.print(OS, &ST); + OS << "RP after: "; + After.print(OS, &ST); +} + +#endif + +// DAG builder helper +class GCNIterativeScheduler::BuildDAG { + GCNIterativeScheduler &Sch; + SmallVector TopRoots; +public: + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) + : Sch(_Sch) { + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + + //Sch.buildSchedGraph(Sch.AA, nullptr, &Sch.SUPressureDiffs); + //Sch.initRPTracker(Sch.TopRPTracker, R.Begin); + + Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, + /*TrackLaneMask*/true); + Sch.Topo.InitDAGTopologicalSorting(); + + SmallVector BotRoots; + Sch.findRootsAndBiasEdges(TopRoots, BotRoots); + } + ~BuildDAG() { + Sch.BaseClass::exitRegion(); + 
Sch.BaseClass::finishBlock(); + } + ArrayRef getTopRoots() const { + return TopRoots; + } +}; + +class GCNIterativeScheduler::OverrideLegacyStrategy { + GCNIterativeScheduler &Sch; + Region &Rgn; + std::unique_ptr SaveSchedImpl; + GCNRegPressure SaveMaxRP; +public: + OverrideLegacyStrategy(Region &R, + MachineSchedStrategy &OverrideStrategy, + GCNIterativeScheduler &_Sch) + : Sch(_Sch) + , Rgn(R) + , SaveSchedImpl(std::move(_Sch.SchedImpl)) + , SaveMaxRP(R.MaxPressure) { + Sch.SchedImpl.reset(&OverrideStrategy); + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + } + ~OverrideLegacyStrategy() { + Sch.BaseClass::exitRegion(); + Sch.BaseClass::finishBlock(); + Sch.SchedImpl.release(); + Sch.SchedImpl = std::move(SaveSchedImpl); + } + void schedule() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + DEBUG(dbgs() << "\nScheduling "; + printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); + Sch.BaseClass::schedule(); + + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + Sch.RegionEnd = Rgn.End; + //assert(Rgn.End == Sch.RegionEnd); + Rgn.Begin = Sch.RegionBegin; + Rgn.MaxPressure.clear(); + } + void restoreOrder() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + // DAG SUnits are stored using original region's order + // so just use SUnits as the restoring schedule + Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP); + } +}; + +// just a stub to make base class happy +class SchedStrategyStub : public MachineSchedStrategy { +public: + bool shouldTrackPressure() const override { return false; } + bool shouldTrackLaneMasks() const override { return false; } + void initialize(ScheduleDAGMI *DAG) override {} + SUnit *pickNode(bool &IsTopNode) override { return nullptr; } + void schedNode(SUnit *SU, bool IsTopNode) override {} + void releaseTopNode(SUnit *SU) override {} + void releaseBottomNode(SUnit *SU) override {} +}; + 
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S) + : BaseClass(C, llvm::make_unique()) + , Context(C) + , Strategy(S) + , UPTracker(*LIS) { +} + +// returns max pressure for a region +GCNRegPressure +GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) + const { + // For the purpose of pressure tracking bottom inst of the region should + // be also processed. End is either BB end, BB terminator inst or sched + // boundary inst. + auto const BBEnd = Begin->getParent()->end(); + auto const BottomMI = End == BBEnd ? --End : End; + + // scheduleRegions walks bottom to top, so its likely we just get next + // instruction to track + auto AfterBottomMI = std::next(BottomMI); + if (AfterBottomMI == BBEnd || + &*AfterBottomMI != UPTracker.getLastTrackedMI()) { + UPTracker.reset(*BottomMI); + } else { + assert(UPTracker.isValid()); + } + + for (auto I = BottomMI; I != Begin; --I) + UPTracker.recede(*I); + + UPTracker.recede(*Begin); + + assert(UPTracker.isValid() || + (dbgs() << "Tracked region ", + printRegion(dbgs(), Begin, End, LIS), false)); + return UPTracker.moveMaxPressure(); +} + +// returns max pressure for a tentative schedule +template GCNRegPressure +GCNIterativeScheduler::getSchedulePressure(const Region &R, + Range &&Schedule) const { + auto const BBEnd = R.Begin->getParent()->end(); + GCNUpwardRPTracker RPTracker(*LIS); + if (R.End != BBEnd) { + // R.End points to the boundary instruction but the + // schedule doesn't include it + RPTracker.reset(*R.End); + RPTracker.recede(*R.End); + } else { + // R.End doesn't point to the boundary instruction + RPTracker.reset(*std::prev(BBEnd)); + } + for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) { + RPTracker.recede(*getMachineInstr(*--I)); + } + return RPTracker.moveMaxPressure(); +} + +void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden + MachineBasicBlock::iterator Begin, + 
MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs); + if (NumRegionInstrs > 2) { + Regions.push_back( + new (Alloc.Allocate()) + Region { Begin, End, NumRegionInstrs, getRegionPressure(Begin, End) }); + } +} + +void GCNIterativeScheduler::schedule() { // overriden + // do nothing + DEBUG( + printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); + if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { + dbgs() << "Max RP: "; + Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget()); + } + dbgs() << '\n'; + ); +} + +void GCNIterativeScheduler::finalizeSchedule() { // overriden + if (Regions.empty()) + return; + switch (Strategy) { + case SCHEDULE_MINREGONLY: scheduleMinReg(); break; + case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break; + case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break; + default: assert(false); + } +} + +// Detach schedule from SUnits and interleave it with debug values. +// Returned schedule becomes independent of DAG state. 
+std::vector +GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const { + std::vector Res; + Res.reserve(Schedule.size() * 2); + + if (FirstDbgValue) + Res.push_back(FirstDbgValue); + + const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end(); + for (auto SU : Schedule) { + Res.push_back(SU->getInstr()); + const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) { + return P.second == SU->getInstr(); + }); + if (D != DbgE) + Res.push_back(D->first); + } + return Res; +} + +void GCNIterativeScheduler::setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP) { + R.BestSchedule.reset( + new TentativeSchedule{ detachSchedule(Schedule), MaxRP }); +} + +void GCNIterativeScheduler::scheduleBest(Region &R) { + assert(R.BestSchedule.get() && "No schedule specified"); + scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure); + R.BestSchedule.reset(); +} + +// minimal required region scheduler, works for ranges of SUnits*, +// SUnits or MachineIntrs* +template +void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP) { + assert(RegionBegin == R.Begin && RegionEnd == R.End); + assert(LIS != nullptr); +#ifndef NDEBUG + const auto SchedMaxRP = getSchedulePressure(R, Schedule); +#endif + auto BB = R.Begin->getParent(); + auto Top = R.Begin; + for (const auto &I : Schedule) { + auto MI = getMachineInstr(I); + if (MI != &*Top) { + BB->remove(MI); + BB->insert(Top, MI); + if (!MI->isDebugValue()) + LIS->handleMove(*MI, true); + } + if (!MI->isDebugValue()) { + // Reset read - undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true, + /*IgnoreDead*/false); + // Adjust liveness and add missing dead+read-undef flags. 
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } + Top = std::next(MI->getIterator()); + } + RegionBegin = getMachineInstr(Schedule.front()); + + // Schedule consisting of MachineInstr* is considered 'detached' + // and already interleaved with debug values + if (!std::is_same::value) { + placeDebugValues(); + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + //assert(R.End == RegionEnd); + RegionEnd = R.End; + } + + R.Begin = RegionBegin; + R.MaxPressure = MaxRP; + +#ifndef NDEBUG + const auto RegionMaxRP = getRegionPressure(R); + const auto &ST = MF.getSubtarget(); +#endif + assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP)) + || (dbgs() << "Max RP mismatch!!!\n" + "RP for schedule (calculated): ", + SchedMaxRP.print(dbgs(), &ST), + dbgs() << "RP for schedule (reported): ", + MaxRP.print(dbgs(), &ST), + dbgs() << "RP after scheduling: ", + RegionMaxRP.print(dbgs(), &ST), + false)); +} + +// Sort recorded regions by pressure - highest at the front +void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { + const auto &ST = MF.getSubtarget(); + std::sort(Regions.begin(), Regions.end(), + [&ST, TargetOcc](const Region *R1, const Region *R2) { + return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); + }); +} + +/////////////////////////////////////////////////////////////////////////////// +// Legacy MaxOccupancy Strategy + +// Tries to increase occupancy applying minreg scheduler for a sequence of +// most demanding regions. Obtained schedules are saved as BestSchedule for a +// region. +// TargetOcc is the best achievable occupancy for a kernel. +// Returns better occupancy on success or current occupancy on fail. +// BestSchedules aren't deleted on fail. 
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { + // TODO: assert Regions are sorted descending by pressure + const auto &ST = MF.getSubtarget(); + const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc + << ", current = " << Occ << '\n'); + + auto NewOcc = TargetOcc; + for (auto R : Regions) { + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) + break; + + DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); + printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + const auto MaxRP = getSchedulePressure(*R, MinSchedule); + DEBUG(dbgs() << "Occupancy improvement attempt:\n"; + printSchedRP(dbgs(), R->MaxPressure, MaxRP)); + + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); + if (NewOcc <= Occ) + break; + + setBestSchedule(*R, MinSchedule, MaxRP); + } + DEBUG(dbgs() << "New occupancy = " << NewOcc + << ", prev occupancy = " << Occ << '\n'); + return std::max(NewOcc, Occ); +} + +void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( + bool TryMaximizeOccupancy) { + const auto &ST = MF.getSubtarget(); + auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + + sortRegionsByPressure(TgtOcc); + auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + + if (TryMaximizeOccupancy && Occ < TgtOcc) + Occ = tryMaximizeOccupancy(TgtOcc); + + // This is really weird but for some magic scheduling regions twice + // gives performance improvement + const int NumPasses = Occ < TgtOcc ? 2 : 1; + + TgtOcc = std::min(Occ, TgtOcc); + DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " << TgtOcc << '\n'); + GCNMaxOccupancySchedStrategy LStrgy(Context); + + for (int I = 0; I < NumPasses; ++I) { + // running first pass with TargetOccupancy = 0 mimics previous scheduling + // approach and is a performance magic + LStrgy.setTargetOccupancy(I == 0 ? 
0 : TgtOcc); + for (auto R : Regions) { + OverrideLegacyStrategy Ovr(*R, LStrgy, *this); + + Ovr.schedule(); + const auto RP = getRegionPressure(*R); + DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + + if (RP.getOccupancy(ST) < TgtOcc) { + DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { + DEBUG(dbgs() << ", scheduling minimal registry\n"); + scheduleBest(*R); + } else { + DEBUG(dbgs() << ", restoring\n"); + Ovr.restoreOrder(); + assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Minimal Registry Strategy + +void GCNIterativeScheduler::scheduleMinReg(bool force) { + const auto &ST = MF.getSubtarget(); + const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + sortRegionsByPressure(TgtOcc); + + auto MaxPressure = Regions.front()->MaxPressure; + for (auto R : Regions) { + if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc)) + break; + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + + const auto RP = getSchedulePressure(*R, MinSchedule); + DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + dbgs() << "\nWarning: Pressure becomes worse after minreg!"; + printSchedRP(dbgs(), R->MaxPressure, RP); + }); + + if (!force && MaxPressure.less(ST, RP, TgtOcc)) + break; + + scheduleRegion(*R, MinSchedule, RP); + DEBUG(printSchedResult(dbgs(), R, RP)); + + MaxPressure = RP; + } +} Index: lib/Target/AMDGPU/GCNMinRegStrategy.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -0,0 +1,267 @@ +//===----------------------- GCNMinRegStrategy.cpp - ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ScheduleDAG.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +class GCNMinRegScheduler { + struct Candidate : ilist_node { + const SUnit *SU; + int Priority; + + Candidate(const SUnit *SU_, int Priority_ = 0) + : SU(SU_), Priority(Priority_) {} + }; + + SpecificBumpPtrAllocator Alloc; + typedef simple_ilist Queue; + Queue RQ; // Ready queue + + std::vector NumPreds; + + bool isScheduled(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + return NumPreds[SU->NodeNum] == std::numeric_limits::max(); + } + + void setIsScheduled(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + NumPreds[SU->NodeNum] = std::numeric_limits::max(); + } + + unsigned getNumPreds(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits::max()); + return NumPreds[SU->NodeNum]; + } + + unsigned decNumPreds(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits::max()); + return --NumPreds[SU->NodeNum]; + } + + void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits); + + int getReadySuccessors(const SUnit *SU) const; + int getNotReadySuccessors(const SUnit *SU) const; + + template + unsigned findMax(unsigned Num, Calc C); + + Candidate* pickCandidate(); + + void bumpPredsPriority(const SUnit *SchedSU, int Priority); + void releaseSuccessors(const SUnit* SU, int Priority); + +public: + std::vector schedule(ArrayRef TopRoots, + const ScheduleDAG &DAG); +}; + +void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { + NumPreds.resize(SUnits.size()); + for (unsigned I = 0; I < SUnits.size(); ++I) + NumPreds[I] = SUnits[I].NumPredsLeft; +} + +int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const { + unsigned 
NumSchedSuccs = 0; + for (auto SDep : SU->Succs) { + bool wouldBeScheduled = true; + for (auto PDep : SDep.getSUnit()->Preds) { + auto PSU = PDep.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (PSU != SU && !isScheduled(PSU)) { + wouldBeScheduled = false; + break; + } + } + NumSchedSuccs += wouldBeScheduled ? 1 : 0; + } + return NumSchedSuccs; +} + +int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const { + return SU->Succs.size() - getReadySuccessors(SU); +} + +template +unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) { + assert(!RQ.empty() && Num <= RQ.size()); + typedef decltype(C(*RQ.begin())) T; + T Max = std::numeric_limits::min(); + unsigned NumMax = 0; + for (auto I = RQ.begin(); Num; --Num) { + T Cur = C(*I); + if (Cur >= Max) { + if (Cur > Max) { + Max = Cur; + NumMax = 1; + } else + ++NumMax; + auto &Cand = *I++; + RQ.remove(Cand); + RQ.push_front(Cand); + continue; + } + ++I; + } + return NumMax; +} + +GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() { + do { + unsigned Num = RQ.size(); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return C.Priority; }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + int Res = getNotReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " + << Res << " successors, metric = " << -Res << '\n'); + return -Res; + }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting most producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + auto Res = getReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " + << Res << " successors, metric = " << Res << '\n'); + return Res; + }); + if (Num == 1) break; + + Num = Num ? 
Num : RQ.size(); + DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; }); + assert(Num == 1); + } while (false); + + return &RQ.front(); +} + +void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { + SmallPtrSet Set; + for (const auto &S : SchedSU->Succs) { + if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) || + S.getKind() != SDep::Data) + continue; + for (const auto &P : S.getSUnit()->Preds) { + auto PSU = P.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (!isScheduled(PSU) && PSU != SchedSU) { + Set.insert(PSU); + } + } + } + SmallVector Worklist(Set.begin(), Set.end()); + while (!Worklist.empty()) { + auto SU = Worklist.back(); + Worklist.pop_back(); + assert(!SU->isBoundaryNode()); + for (const auto &P : SU->Preds) { + if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) && + Set.insert(P.getSUnit()).second) + Worklist.push_back(P.getSUnit()); + } + } + DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum + << ")'s non-ready successors of " << Priority + << " priority in ready queue: "); + const auto SetEnd = Set.end(); + for (auto &C : RQ) { + if (Set.find(C.SU) != SetEnd) { + C.Priority = Priority; + DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); + } + } + DEBUG(dbgs() << '\n'); +} + +void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) { + for (const auto &S : SU->Succs) { + auto SuccSU = S.getSUnit(); + if (S.isWeak()) + continue; + assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0); + if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0) + RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority)); + } +} + +std::vector +GCNMinRegScheduler::schedule(ArrayRef TopRoots, + const ScheduleDAG &DAG) { + const auto &SUnits = DAG.SUnits; + std::vector Schedule; + Schedule.reserve(SUnits.size()); + + initNumPreds(SUnits); + 
+ int StepNo = 0; + + for (auto SU : TopRoots) { + RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo)); + } + releaseSuccessors(&DAG.EntrySU, StepNo); + + while (!RQ.empty()) { + DEBUG( + dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n" + "Ready queue:"; + for (auto &C : RQ) + dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; + dbgs() << '\n'; + ); + + auto C = pickCandidate(); + assert(C); + RQ.remove(*C); + auto SU = C->SU; + DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + + releaseSuccessors(SU, StepNo); + Schedule.push_back(SU); + setIsScheduled(SU); + + if (getReadySuccessors(SU) == 0) + bumpPredsPriority(SU, StepNo); + + ++StepNo; + } + assert(SUnits.size() == Schedule.size()); + + return Schedule; +} + +namespace llvm { +std::vector makeMinRegSchedule(ArrayRef TopRoots, + const ScheduleDAG &DAG) { + GCNMinRegScheduler S; + return S.schedule(TopRoots, DAG); +} +} \ No newline at end of file Index: lib/Target/AMDGPU/GCNRegPressure.h =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNRegPressure.h @@ -0,0 +1,174 @@ +//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H +#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H + +#include "AMDGPUSubtarget.h" + +#include + +namespace llvm { + +struct GCNRegPressure { + enum RegKind { + SGPR32, + LARGESGPR, + VGPR32, + LARGEVGPR, + TOTAL_KINDS + }; + + GCNRegPressure() { + clear(); + } + + bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; } + + void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + + unsigned getSGRPNum() const { return Value[SGPR32]; } + unsigned getVGRPNum() const { return Value[VGPR32]; } + + unsigned getLargeVGPRsWeight() const { return Value[LARGEVGPR]; } + unsigned getLargeSGPRsWeight() const { return Value[LARGESGPR]; } + + unsigned getOccupancy(const SISubtarget &ST) const { + return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()), + ST.getOccupancyWithNumVGPRs(getVGRPNum())); + } + + void inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI); + + bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const { + return getOccupancy(ST) > O.getOccupancy(ST); + } + + bool less(const SISubtarget &ST, const GCNRegPressure& O, + unsigned MaxOccupancy = std::numeric_limits::max()) const; + + bool operator==(const GCNRegPressure &O) const { + return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value); + } + + bool operator!=(const GCNRegPressure &O) const { + return !(*this == O); + } + + void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const; + void dump() const { print(dbgs()); } + +private: + unsigned Value[TOTAL_KINDS]; + + unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) const; + + friend GCNRegPressure max(const GCNRegPressure &P1, + const GCNRegPressure &P2); +}; + +inline GCNRegPressure max(const GCNRegPressure &P1, 
+                          const GCNRegPressure &P2) {
+  GCNRegPressure Res;
+  for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I)
+    Res.Value[I] = std::max(P1.Value[I], P2.Value[I]);
+  return Res;
+}
+
+class GCNRPTracker {
+public:
+  typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+
+protected:
+  LiveRegSet LiveRegs;
+  GCNRegPressure CurPressure, MaxPressure;
+  const MachineInstr *LastTrackedMI = nullptr;
+  mutable const MachineRegisterInfo *MRI = nullptr;
+  GCNRPTracker() {}
+public:
+  // live regs for the current state
+  const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+  const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+  // returns MaxPressure, resetting it
+  decltype(MaxPressure) moveMaxPressure() {
+    auto Res = MaxPressure;
+    MaxPressure.clear();
+    return Res;
+  }
+  decltype(LiveRegs) moveLiveRegs() {
+    return std::move(LiveRegs);
+  }
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+  const LiveIntervals &LIS;
+  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
+public:
+  GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+  // reset tracker to the point just below MI
+  // filling live regs upon this point using LIS
+  void reset(const MachineInstr &MI);
+
+  // move to the state just above the MI
+  void recede(const MachineInstr &MI);
+
+#ifndef NDEBUG
+  // checks whether the tracker's state after receding MI corresponds
+  // to reported by LIS
+  bool isValid() const;
+#endif
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+                            SlotIndex SI,
+                            const LiveIntervals &LIS,
+                            const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+                                     const LiveIntervals &LIS,
+                                     const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+                                                 const LiveIntervals &LIS) {
+  return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
+                     MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet
getLiveRegsBefore(const MachineInstr &MI, + const LiveIntervals &LIS) { + return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, + MI.getParent()->getParent()->getRegInfo()); +} + +template +GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, + Range &&LiveRegs) { + GCNRegPressure Res; + for (const auto &RM : LiveRegs) + Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI); + return Res; +} + +#ifndef NDEBUG +void printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); +#endif + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H \ No newline at end of file Index: lib/Target/AMDGPU/GCNRegPressure.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/GCNRegPressure.cpp @@ -0,0 +1,374 @@ +//===------------------------- GCNRegPressure.cpp - -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNRegPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +#ifndef NDEBUG +void llvm::printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + dbgs() << "Live regs at " << SI << ": " + << *LIS.getInstructionFromIndex(SI); + unsigned Num = 0; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + bool firstTime = true; + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) continue; + if (firstTime) { + dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo()) + << '\n'; + firstTime = false; + } + dbgs() << " " << S << '\n'; + ++Num; + } + } else if (LI.liveAt(SI)) { + dbgs() << " " << LI << '\n'; + ++Num; + } + } + if (!Num) dbgs() << " \n"; +} + +static bool isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2) { + if (S1.size() != S2.size()) + return false; + + for (const auto &P : S1) { + auto I = S2.find(P.first); + if (I == S2.end() || I->second != P.second) + return false; + } + return true; +} + +static GCNRPTracker::LiveRegSet +stripEmpty(const GCNRPTracker::LiveRegSet &LR) { + GCNRPTracker::LiveRegSet Res; + for (const auto &P : LR) { + if (P.second.any()) + Res.insert(P); + } + return Res; +} + +#endif + + +/////////////////////////////////////////////////////////////////////////////// +// GCNRegPressure + +unsigned GCNRegPressure::getRegKind(unsigned Reg, + const MachineRegisterInfo &MRI) const { + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + const auto RC = MRI.getRegClass(Reg); + assert(RC); + switch (RC->getID()) { + case AMDGPU::VS_32RegClassID: + case 
AMDGPU::VGPR_32RegClassID: return VGPR32; + case AMDGPU::VS_64RegClassID: + case AMDGPU::VReg_64RegClassID: + case AMDGPU::VReg_96RegClassID: + case AMDGPU::VReg_128RegClassID: + case AMDGPU::VReg_256RegClassID: + case AMDGPU::VReg_512RegClassID: return LARGEVGPR; + case AMDGPU::SReg_32RegClassID: + case AMDGPU::SReg_32_XM0RegClassID: + case AMDGPU::SReg_32_XM0_XEXECRegClassID: + case AMDGPU::SGPR_32RegClassID: return SGPR32; + case AMDGPU::SReg_64RegClassID: + case AMDGPU::SReg_64_XEXECRegClassID: + case AMDGPU::SGPR_64RegClassID: + case AMDGPU::SReg_128RegClassID: + case AMDGPU::SGPR_128RegClassID: + case AMDGPU::SReg_256RegClassID: + case AMDGPU::SReg_512RegClassID: return LARGESGPR; + default: assert(false); + } + return -1; +} + +void GCNRegPressure::inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI) { +#ifndef NDEBUG + const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg); +#endif + if (NewMask == PrevMask) + return; + + int Sign = 1; + if (NewMask < PrevMask) { + std::swap(NewMask, PrevMask); + Sign = -1; + } + switch (auto Kind = getRegKind(Reg, MRI)) { + case SGPR32: + case VGPR32: + assert(PrevMask.none() && NewMask == MaxMask); + Value[Kind] += Sign; + break; + + case LARGESGPR: + case LARGEVGPR: + assert(NewMask < MaxMask || NewMask == MaxMask); + assert(PrevMask < NewMask); + + Value[Kind == LARGESGPR ? 
SGPR32 : VGPR32] += + Sign * countPopulation((~PrevMask & NewMask).getAsInteger()); + + if (PrevMask.none()) { + assert(NewMask.any()); + Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight(); + } + break; + + default: assert(false); + } +} + +bool GCNRegPressure::less(const SISubtarget &ST, + const GCNRegPressure& O, + unsigned MaxOccupancy) const { + const auto SGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(getSGRPNum())); + const auto VGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGRPNum())); + const auto OtherSGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(O.getSGRPNum())); + const auto OtherVGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(O.getVGRPNum())); + + const auto Occ = std::min(SGPROcc, VGPROcc); + const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); + if (Occ != OtherOcc) + return Occ > OtherOcc; + + const bool SGPRImportant = SGPROcc < VGPROcc; + const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc; + + // if both pressures disagree on what is more important + // we cannot compare it directly + if (SGPRImportant != OtherSGPRImportant) { + return false; + } + + // compare large regs pressure + bool SGPRFirst = SGPRImportant; + for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) { + if (SGPRFirst) { + auto SW = getLargeSGPRsWeight(); + auto OtherSW = O.getLargeSGPRsWeight(); + if (SW != OtherSW) + return SW < OtherSW; + } else { + auto VW = getLargeVGPRsWeight(); + auto OtherVW = O.getLargeVGPRsWeight(); + if (VW != OtherVW) + return VW < OtherVW; + } + } + return SGPRImportant ? 
(getSGRPNum() < O.getSGRPNum()): + (getVGRPNum() < O.getVGRPNum()); +} + +void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { + OS << "VGPRs: " << getVGRPNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')'; + OS << ", SGPRs: " << getSGRPNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')'; + OS << ", LVGPR WT: " << getLargeVGPRsWeight() + << ", LSGPR WT: " << getLargeSGPRsWeight(); + if (ST) OS << " -> Occ: " << getOccupancy(*ST); + OS << '\n'; +} + +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + +LaneBitmask llvm::getLiveLaneMask(unsigned Reg, + SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + assert(!MRI.reg_nodbg_empty(Reg)); + LaneBitmask LiveMask; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) + if (S.liveAt(SI)) { + LiveMask |= S.LaneMask; + assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) || + LiveMask == MRI.getMaxLaneMaskForVReg(Reg)); + } + } else if (LI.liveAt(SI)) { + LiveMask = MRI.getMaxLaneMaskForVReg(Reg); + } + return LiveMask; +} + +GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + GCNRPTracker::LiveRegSet LiveRegs; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); + if (LiveMask.any()) + LiveRegs[Reg] = LiveMask; + } + return LiveRegs; +} + +void GCNUpwardRPTracker::reset(const MachineInstr &MI) { + assert(&LIS && "Invalid LIS"); + MRI = &MI.getParent()->getParent()->getRegInfo(); + LiveRegs = getLiveRegsAfter(MI, LIS); + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + +LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { + assert(MO.isDef() && 
MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + // We don't rely on read-undef flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0? + MRI->getMaxLaneMaskForVReg(MO.getReg()) : + MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); +} + +LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { + assert(MO.isUse() && MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + if (auto SubReg = MO.getSubReg()) + return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); + + auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg()); + if (MaxMask.getAsInteger() == 1) // cannot have subregs + return MaxMask; + + // For a tentative schedule LIS isn't updated yet but livemask should remain + // the same on any schedule. Subreg defs can be reordered but they all must + // dominate uses anyway. 
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex(); + return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI); +} + +void GCNUpwardRPTracker::recede(const MachineInstr &MI) { + assert(&LIS && "Invalid LIS"); + assert(MRI && "call reset first"); + + LastTrackedMI = &MI; + + if (MI.isDebugValue()) + return; + + // process all defs first to ensure early clobbers are handled correctly + // iterating over operands() to catch implicit defs + for (const auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + + auto Reg = MO.getReg(); + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask &= ~getDefRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + // then all uses + for (const auto &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + + auto Reg = MO.getReg(); + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getUsedRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +#ifndef NDEBUG +static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, + const GCNRPTracker::LiveRegSet &TrackedLR, + const TargetRegisterInfo *TRI) { + for (auto const &P : TrackedLR) { + auto I = LISLR.find(P.first); + if (I == LISLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in LIS reported set\n"; + } + else if (I->second != P.second) { + dbgs() << " " << PrintReg(P.first, TRI) + << " masks doesn't match: LIS reported " + << PrintLaneMask(I->second) + << ", tracked " + << PrintLaneMask(P.second) + << '\n'; + } + } + for (auto const &P : LISLR) { + auto I = TrackedLR.find(P.first); + if (I == TrackedLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in tracked set\n"; + } + } +} + 
+bool GCNUpwardRPTracker::isValid() const { + const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); + const auto TrackedLR = stripEmpty(LiveRegs); + + if (!isEqual(LISLR, TrackedLR)) { + dbgs() << "\nGCNUpwardRPTracker error: Tracked and" + " LIS reported livesets mismatch:\n"; + printLivesAt(SI, LIS, *MRI); + reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo()); + return false; + } + + auto LISPressure = getRegPressure(*MRI, LISLR); + if (LISPressure != CurPressure) { + dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: "; + CurPressure.print(dbgs()); + dbgs() << "LIS rpt: "; + LISPressure.print(dbgs()); + return false; + } + return true; +} +#endif \ No newline at end of file Index: lib/Target/AMDGPU/GCNSchedStrategy.h =================================================================== --- lib/Target/AMDGPU/GCNSchedStrategy.h +++ lib/Target/AMDGPU/GCNSchedStrategy.h @@ -55,6 +55,8 @@ SUnit *pickNode(bool &IsTopNode) override; void initialize(ScheduleDAGMI *DAG) override; + + void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } }; class GCNScheduleDAGMILive : public ScheduleDAGMILive { Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp =================================================================== --- lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -45,8 +45,6 @@ const SIRegisterInfo *SRI = static_cast(TRI); - if (MF != &DAG->MF) - TargetOccupancy = 0; MF = &DAG->MF; const SISubtarget &ST = MF->getSubtarget(); @@ -346,12 +344,16 @@ PressureBefore = getRealRegPressure(); } + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + if (Stage == 0) { + S.TargetOccupancy = 0; + } + ScheduleDAGMILive::schedule(); if (!LIS) return; // Check the results of scheduling. 
-  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
   DEBUG(dbgs() << "Pressure after scheduling:\n");
   auto PressureAfter = getRealRegPressure();
   LiveIns.clear();