Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -219,6 +219,15 @@
     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
 }
 
+static ScheduleDAGInstrs *
+createIterativeILPMachineScheduler(MachineSchedContext *C) {
+  auto DAG = new GCNIterativeScheduler(C,
+    GCNIterativeScheduler::SCHEDULE_ILP);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
 static MachineSchedRegistry
 R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);
@@ -242,6 +251,11 @@
   "Run GCN iterative scheduler for minimal register usage (experimental)",
   createMinRegScheduler);
 
+static MachineSchedRegistry
+GCNILPSchedRegistry("gcn-ilp",
+  "Run GCN iterative scheduler for ILP scheduling (experimental)",
+  createIterativeILPMachineScheduler);
+
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -95,6 +95,7 @@
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
+  GCNILPSched.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/AMDGPU/GCNILPSched.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/GCNILPSched.cpp
@@ -0,0 +1,364 @@
+//===---------------------------- GCNILPSched.cpp - -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+class GCNILPScheduler {
+  struct Candidate : ilist_node<Candidate> {
+    SUnit *SU;
+
+    Candidate(SUnit *SU_)
+      : SU(SU_) {}
+  };
+
+  SpecificBumpPtrAllocator<Candidate> Alloc;
+  typedef simple_ilist<Candidate> Queue;
+  Queue PendingQueue;
+  Queue AvailQueue;
+  unsigned CurQueueId = 0;
+
+  std::vector<unsigned> SUNumbers;
+
+  /// CurCycle - The current scheduler state corresponds to this cycle.
+  unsigned CurCycle = 0;
+
+  unsigned getNodePriority(const SUnit *SU) const;
+
+  const SUnit *pickBest(const SUnit *left, const SUnit *right);
+  Candidate* pickCandidate();
+
+  void releasePending();
+  void advanceToCycle(unsigned NextCycle);
+  void releasePredecessors(const SUnit* SU);
+
+public:
+  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> BotRoots,
+                                     const ScheduleDAG &DAG);
+};
+} // namespace
+
+/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number.
+/// A smaller number means a higher priority.
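+/// Worked example (added for illustration, not from the original sources):
+/// a leaf with no data predecessors gets number 1; a node whose data
+/// predecessors all carry number 1 gets 1 + (#preds - 1), so a two-operand
+/// node over two leaves gets 2, reflecting the extra value that stays live
+/// while the second operand is evaluated.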
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+  unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+  if (SethiUllmanNumber != 0)
+    return SethiUllmanNumber;
+
+  unsigned Extra = 0;
+  for (const SDep &Pred : SU->Preds) {
+    if (Pred.isCtrl()) continue;  // ignore chain preds
+    SUnit *PredSU = Pred.getSUnit();
+    unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+    if (PredSethiUllman > SethiUllmanNumber) {
+      SethiUllmanNumber = PredSethiUllman;
+      Extra = 0;
+    } else if (PredSethiUllman == SethiUllmanNumber)
+      ++Extra;
+  }
+
+  SethiUllmanNumber += Extra;
+
+  if (SethiUllmanNumber == 0)
+    SethiUllmanNumber = 1;
+
+  return SethiUllmanNumber;
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const {
+  assert(SU->NodeNum < SUNumbers.size());
+  if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+    // If SU does not have a register use, i.e. it doesn't produce a value
+    // that would be consumed (e.g. store), then it terminates a chain of
+    // computation. Give it a large Sethi-Ullman number so it will be
+    // scheduled right before its predecessors, so that it doesn't lengthen
+    // their live ranges.
+    return 0xffff;
+
+  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+    // If SU does not have a register def, schedule it close to its uses
+    // because it does not lengthen any live ranges.
+    return 0;
+
+  return SUNumbers[SU->NodeNum];
+}
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+  unsigned MaxHeight = 0;
+  for (const SDep &Succ : SU->Succs) {
+    if (Succ.isCtrl()) continue;  // ignore chain succs
+    unsigned Height = Succ.getSUnit()->getHeight();
+    // If there are a bunch of CopyToRegs stacked up, they should be considered
+    // to be at the same position.
+    if (Height > MaxHeight)
+      MaxHeight = Height;
+  }
+  return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+  unsigned Scratches = 0;
+  for (const SDep &Pred : SU->Preds) {
+    if (Pred.isCtrl()) continue;  // ignore chain preds
+    Scratches++;
+  }
+  return Scratches;
+}
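+
+// Note on the overall comparison (added for clarity): pickBest below walks a
+// cascade of heuristics: critical-path depth and height (each bounded by a
+// reorder window), Sethi-Ullman priority, distance to the closest successor,
+// the scratch-register estimate, and finally latency via BUCompareLatency,
+// with queue insertion order breaking any remaining ties.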
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(const SUnit *left, const SUnit *right) {
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LHeight = (int)left->getHeight();
+  int RHeight = (int)right->getHeight();
+
+  // If either node is scheduling for latency, sort them by height/depth
+  // and latency.
+
+  // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+  // is enabled, grouping instructions by cycle, then its height is already
+  // covered so only its depth matters. We also reach this point if both stall
+  // but have the same height.
+  if (LHeight != RHeight)
+    return LHeight > RHeight ? 1 : -1;
+
+  int LDepth = left->getDepth();
+  int RDepth = right->getDepth();
+  if (LDepth != RDepth) {
+    DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
+                 << ") depth " << LDepth << " vs SU (" << right->NodeNum
+                 << ") depth " << RDepth << "\n");
+    return LDepth < RDepth ? 1 : -1;
+  }
+  if (left->Latency != right->Latency)
+    return left->Latency > right->Latency ? 1 : -1;
+
+  return 0;
+}
+
+const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right)
+{
+  // TODO: add register pressure lowering checks
+
+  bool const DisableSchedCriticalPath = false;
+  int MaxReorderWindow = 6;
+  if (!DisableSchedCriticalPath) {
+    int spread = (int)left->getDepth() - (int)right->getDepth();
+    if (std::abs(spread) > MaxReorderWindow) {
+      DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+                   << left->getDepth() << " != SU(" << right->NodeNum << "): "
+                   << right->getDepth() << "\n");
+      return left->getDepth() < right->getDepth() ? right : left;
+    }
+  }
+
+  bool const DisableSchedHeight = false;
+  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+    int spread = (int)left->getHeight() - (int)right->getHeight();
+    if (std::abs(spread) > MaxReorderWindow)
+      return left->getHeight() > right->getHeight() ? right : left;
+  }
+
+  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+  unsigned LPriority = getNodePriority(left);
+  unsigned RPriority = getNodePriority(right);
+
+  if (LPriority != RPriority)
+    return LPriority > RPriority ? right : left;
+
+  // Try to schedule def + use closer together when Sethi-Ullman numbers are
+  // the same, e.g.
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // and the following instructions are both ready.
+  // t2 = op c3
+  // t4 = op c4
+  //
+  // Then schedule t2 = op first.
+  // i.e.
+  // t4 = op c4
+  // t2 = op c3
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // This creates more short live intervals.
+  unsigned LDist = closestSucc(left);
+  unsigned RDist = closestSucc(right);
+  if (LDist != RDist)
+    return LDist < RDist ? right : left;
+
+  // How many registers become live when the node is scheduled.
+  unsigned LScratch = calcMaxScratches(left);
+  unsigned RScratch = calcMaxScratches(right);
+  if (LScratch != RScratch)
+    return LScratch > RScratch ? right : left;
+
+  bool const DisableSchedCycles = false;
+  if (!DisableSchedCycles) {
+    int result = BUCompareLatency(left, right);
+    if (result != 0)
+      return result > 0 ? right : left;
+    return left;
+  } else {
+    if (left->getHeight() != right->getHeight())
+      return (left->getHeight() > right->getHeight()) ? right : left;
+
+    if (left->getDepth() != right->getDepth())
+      return (left->getDepth() < right->getDepth()) ? right : left;
+  }
+
+  assert(left->NodeQueueId && right->NodeQueueId &&
+         "NodeQueueId cannot be zero");
+  return (left->NodeQueueId > right->NodeQueueId) ? right : left;
+}
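+
+// Illustrative example (added, not from the original code): with
+// MaxReorderWindow = 6, a candidate at depth 10 competing against one at
+// depth 2 exceeds the window, so pickBest returns the deeper node
+// immediately and none of the later heuristics are consulted.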
+
+GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() {
+  if (AvailQueue.empty())
+    return nullptr;
+  auto Best = AvailQueue.begin();
+  for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end();
+       I != E; ++I) {
+    auto NewBestSU = pickBest(Best->SU, I->SU);
+    if (NewBestSU != Best->SU) {
+      assert(NewBestSU == I->SU);
+      Best = I;
+    }
+  }
+  return &*Best;
+}
+
+void GCNILPScheduler::releasePending() {
+  // Check to see if any of the pending instructions are ready to issue. If
+  // so, add them to the available queue.
+  for (auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) {
+    auto &C = *I++;
+    if (C.SU->getHeight() <= CurCycle) {
+      PendingQueue.remove(C);
+      AvailQueue.push_back(C);
+      C.SU->NodeQueueId = CurQueueId++;
+    }
+  }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void GCNILPScheduler::advanceToCycle(unsigned NextCycle) {
+  if (NextCycle <= CurCycle)
+    return;
+  CurCycle = NextCycle;
+  releasePending();
+}
+
+void GCNILPScheduler::releasePredecessors(const SUnit* SU) {
+  for (const auto &PredEdge : SU->Preds) {
+    auto PredSU = PredEdge.getSUnit();
+    if (PredEdge.isWeak())
+      continue;
+    assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0);
+
+    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency());
+
+    if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0)
+      PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU));
+  }
+}
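+
+// Overall flow (summary added for clarity): schedule() runs a bottom-up list
+// scheduler. Starting from the bottom roots it repeatedly picks the best
+// available candidate, releases that node's predecessors into the pending
+// queue, and advances the current cycle whenever the available queue runs
+// dry; the resulting bottom-up order is reversed before being returned.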
+
+std::vector<const SUnit*>
+GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
+                          const ScheduleDAG &DAG) {
+  auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits;
+
+  std::vector<SUnit> SUSavedCopy;
+  SUSavedCopy.resize(SUnits.size());
+
+  // We cannot save only the fields we touch: some of them are private,
+  // so save the units verbatim: this assumes SUnit has value semantics.
+  for (const SUnit &SU : SUnits)
+    SUSavedCopy[SU.NodeNum] = SU;
+
+  SUNumbers.assign(SUnits.size(), 0);
+  for (const SUnit &SU : SUnits)
+    CalcNodeSethiUllmanNumber(&SU, SUNumbers);
+
+  for (auto SU : BotRoots) {
+    AvailQueue.push_back(
+      *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU)));
+  }
+  releasePredecessors(&DAG.ExitSU);
+
+  std::vector<const SUnit*> Schedule;
+  Schedule.reserve(SUnits.size());
+  while (true) {
+    if (AvailQueue.empty() && !PendingQueue.empty()) {
+      auto EarliestSU = std::min_element(
+        PendingQueue.begin(), PendingQueue.end(),
+        [=](const Candidate& C1, const Candidate& C2) {
+          return C1.SU->getHeight() < C2.SU->getHeight();
+        })->SU;
+      advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight()));
+    }
+    if (AvailQueue.empty())
+      break;
+
+    DEBUG(
+      dbgs() << "\n=== Picking candidate\n"
+                "Ready queue:";
+      for (auto &C : AvailQueue)
+        dbgs() << ' ' << C.SU->NodeNum;
+      dbgs() << '\n';
+    );
+
+    auto C = pickCandidate();
+    assert(C);
+    AvailQueue.remove(*C);
+    auto SU = C->SU;
+    DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+    advanceToCycle(SU->getHeight());
+
+    releasePredecessors(SU);
+    Schedule.push_back(SU);
+    SU->isScheduled = true;
+  }
+  assert(SUnits.size() == Schedule.size());
+
+  std::reverse(Schedule.begin(), Schedule.end());
+
+  // Restore the units.
+  for (auto &SU : SUnits)
+    SU = SUSavedCopy[SU.NodeNum];
+
+  return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+                                              const ScheduleDAG &DAG) {
+  GCNILPScheduler S;
+  return S.schedule(BotRoots, DAG);
+}
+}
Index: lib/Target/AMDGPU/GCNIterativeScheduler.h
===================================================================
--- lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -32,7 +32,8 @@
   enum StrategyKind {
     SCHEDULE_MINREGONLY,
     SCHEDULE_MINREGFORCED,
-    SCHEDULE_LEGACYMAXOCCUPANCY
+    SCHEDULE_LEGACYMAXOCCUPANCY,
+    SCHEDULE_ILP
   };
 
   GCNIterativeScheduler(MachineSchedContext *C,
@@ -108,6 +109,7 @@
 
   void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
   void scheduleMinReg(bool force = false);
+  void scheduleILP(bool TryMaximizeOccupancy = true);
 
   void printRegions(raw_ostream &OS) const;
   void printSchedResult(raw_ostream &OS,
Index: lib/Target/AMDGPU/GCNIterativeScheduler.cpp
===================================================================
--- lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -39,7 +39,9 @@
   std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
                                                const ScheduleDAG &DAG);
 
-} // end namespace llvm
+  std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots,
+                                                const ScheduleDAG &DAG);
+}
 
 // shim accessors for different order containers
 static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
@@ -141,6 +143,7 @@
   GCNIterativeScheduler &Sch;
   SmallVector<SUnit*, 8> TopRoots;
 
+  SmallVector<SUnit*, 8> BotRoots;
 public:
   BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
     : Sch(_Sch) {
@@ -151,8 +154,6 @@
     Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
                         /*TrackLaneMask*/true);
     Sch.Topo.InitDAGTopologicalSorting();
-
-    SmallVector<SUnit*, 8> BotRoots;
     Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
   }
 
@@ -164,6 +165,9 @@
   ArrayRef<const SUnit*> getTopRoots() const {
     return TopRoots;
   }
+  ArrayRef<SUnit*> getBottomRoots() const {
+    return BotRoots;
+  }
 };
 
 class GCNIterativeScheduler::OverrideLegacyStrategy {
@@ -323,6 +327,7 @@
   case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
   case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
   case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+  case SCHEDULE_ILP: scheduleILP(false); break;
   }
 }
 
@@ -553,3 +558,42 @@
     MaxPressure = RP;
   }
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// ILP scheduler port
+
+void GCNIterativeScheduler::scheduleILP(
+  bool TryMaximizeOccupancy) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+  sortRegionsByPressure(TgtOcc);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+  if (TryMaximizeOccupancy && Occ < TgtOcc)
+    Occ = tryMaximizeOccupancy(TgtOcc);
+
+  TgtOcc = std::min(Occ, TgtOcc);
+  DEBUG(dbgs() << "Scheduling using default scheduler, "
+                  "target occupancy = " << TgtOcc << '\n');
+
+  for (auto R : Regions) {
+    BuildDAG DAG(*R, *this);
+    const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this);
+
+    const auto RP = getSchedulePressure(*R, ILPSchedule);
+    DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+    if (RP.getOccupancy(ST) < TgtOcc) {
+      DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+      if (R->BestSchedule.get() &&
+          R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+        DEBUG(dbgs() << ", scheduling minimal register\n");
+        scheduleBest(*R);
+      }
+    } else {
+      scheduleRegion(*R, ILPSchedule, RP);
+      DEBUG(printSchedResult(dbgs(), R, RP));
+    }
+  }
+}
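
Usage note (not part of the patch itself): the new strategy is selected
through the standard machine scheduler registry, e.g.

  llc -march=amdgcn -misched=gcn-ilp <input.ll>

in the same way as the other GCN iterative scheduler variants registered in
AMDGPUTargetMachine.cpp.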