Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h @@ -20,8 +20,10 @@ class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class MachineSchedContext; class MCAsmInfo; class raw_ostream; +class ScheduleDAGInstrs; class Target; class TargetMachine; @@ -49,6 +51,8 @@ FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -66,8 +66,12 @@ } static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run SI's custom scheduler", + createSIMachineScheduler); static std::string computeDataLayout(const Triple &TT) { std::string Ret = "e-p:32:32"; Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -58,6 +58,7 @@ SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp + SIMachineScheduler.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -462,6 +462,9 @@ uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; }; namespace AMDGPU { Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3079,3 +3079,15 @@ return Rsrc23; } + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} Index: llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.h +++ llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector SUnits; + std::map NodeNum2Index; + std::vector TopReadySUs; + std::vector ScheduledSUnits; + + /// The top of the unscheduled zone. + IntervalPressure TopPressure; + RegPressureTracker TopRPTracker; + + // Pressure: number of said class of registers needed to + // store the live virtual and real registers. + // We do care only of SGPR32 and VGPR32 and do track only virtual registers. + // Pressure of additional registers required inside the block. + std::vector InternalAdditionnalPressure; + // Pressure of input and output registers + std::vector LiveInPressure; + std::vector LiveOutPressure; + // Registers required by the block, and outputs. + // We do track only virtual registers. + // Note that some registers are not 32 bits, + // and thus the pressure is not equal + // to the number of live registers. + std::set LiveInRegs; + std::set LiveOutRegs; + + bool Scheduled; + bool HighLatencyBlock; + + std::vector HasLowLatencyNonWaitedParent; + + // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. + unsigned ID; + + std::vector Preds; // All blocks predecessors. + std::vector Succs; // All blocks successors. + unsigned NumHighLatencySuccessors; + +public: + SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, + unsigned ID): + DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(), + TopRPTracker(TopPressure), Scheduled(false), + HighLatencyBlock(false), ID(ID), + Preds(), Succs(), NumHighLatencySuccessors(0) {}; + + ~SIScheduleBlock() {}; + + unsigned getID() const { return ID; } + + /// Functions for Block construction. + void addUnit(SUnit *SU); + + // When all SUs have been added. + void finalizeUnits(); + + // Add block pred, which has instruction predecessor of SU. + void addPred(SIScheduleBlock *Pred); + void addSucc(SIScheduleBlock *Succ); + + const std::vector& getPreds() const { return Preds; } + const std::vector& getSuccs() const { return Succs; } + + unsigned Height; // Maximum topdown path length to block without outputs + unsigned Depth; // Maximum bottomup path length to block without inputs + + unsigned getNumHighLatencySuccessors() const { + return NumHighLatencySuccessors; + } + + bool isHighLatencyBlock() { return HighLatencyBlock; } + + // This is approximative. + // Ideally should take into accounts some instructions (rcp, etc) + // are 4 times slower. + int getCost() { return SUnits.size(); } + + // The block Predecessors and Successors must be all registered + // before fastSchedule(). + // Fast schedule with no particular requirement. + void fastSchedule(); + + std::vector getScheduledUnits() { return ScheduledSUnits; } + + // Complete schedule that will try to minimize reg pressure and + // low latencies, and will fill liveins and liveouts. + // Needs all MIs to be grouped between BeginBlock and EndBlock. + // The MIs can be moved after the scheduling, + // it is just used to allow correct track of live registers. + void schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); + + bool isScheduled() { return Scheduled; } + + + // Needs the block to be scheduled inside + // TODO: find a way to compute it. + std::vector &getInternalAdditionnalRegUsage() { + return InternalAdditionnalPressure; + } + + std::set &getInRegs() { return LiveInRegs; } + std::set &getOutRegs() { return LiveOutRegs; } + + void printDebug(bool Full); + +private: + struct SISchedCandidate : SISchedulerCandidate { + // The best SUnit candidate. + SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector Blocks; + std::vector TopDownIndex2Block; + std::vector TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector> BlockPtrs; + std::map Blocks; + std::vector CurrentBlocks; + std::vector Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector TopDownIndex2Block; + std::vector TopDownBlock2Index; + std::vector BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector CurrentColoring; + std::vector CurrentTopDownReservedDependencyColoring; + std::vector CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). + void colorForceConsecutiveOrderInGroup(); + + // Merge Constant loads that have all their users into another group to the group. + // (TODO: else if all their users depend on the same group, put them there) + void colorMergeConstantLoadsNextGroup(); + + // Merge SUs that have all their users into another group to the group + void colorMergeIfPossibleNextGroup(); + + // Merge SUs that have all their users into another group to the group, + // but only for Reserved groups. + void colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Merge SUs that have all their users into another group to the group, + // but only if the group is no more than a few SUs. + void colorMergeIfPossibleSmallGroupsToNextGroup(); + + // Divides Blocks with important size. + // Idea of implementation: attribute new colors depending on topdown and + // bottom up links to other blocks. + void cutHugeBlocks(); + + // Put in one group all instructions with no users in this scheduling region + // (we'd want these groups be at the end). + void regroupNoUserInstructions(); + + void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); + + void topologicalSort(); + + void scheduleInsideBlocks(); + + void fillStats(); +}; + +enum SISchedulerBlockSchedulerVariant { + BlockLatencyRegUsage, + BlockRegUsageLatency, + BlockRegUsage +}; + +class SIScheduleBlockScheduler { + SIScheduleDAGMI *DAG; + SISchedulerBlockSchedulerVariant Variant; + std::vector Blocks; + + std::vector> LiveOutRegsNumUsages; + std::set LiveRegs; + // Num of schedulable unscheduled blocks reading the register. + std::map LiveRegsConsumers; + + std::vector LastPosHighLatencyParentScheduled; + int LastPosWaitedHighLatency; + + std::vector BlocksScheduled; + unsigned NumBlockScheduled; + std::vector ReadyBlocks; + + unsigned VregCurrentUsage; + unsigned SregCurrentUsage; + + // Currently is only approximation. + unsigned maxVregUsage; + unsigned maxSregUsage; + + std::vector BlockNumPredsLeft; + std::vector BlockNumSuccsLeft; + +public: + SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct); + ~SIScheduleBlockScheduler() {}; + + std::vector getBlocks() { return BlocksScheduled; }; + + unsigned getVGPRUsage() { return maxVregUsage; }; + unsigned getSGPRUsage() { return maxSregUsage; }; + +private: + struct SIBlockSchedCandidate : SISchedulerCandidate { + // The best Block candidate. + SIScheduleBlock *Block; + + bool IsHighLatency; + int VGPRUsageDiff; + unsigned NumSuccessors; + unsigned NumHighLatencySuccessors; + unsigned LastPosHighLatParentScheduled; + unsigned Height; + + SIBlockSchedCandidate() + : Block(nullptr) {} + + bool isValid() const { return Block; } + + // Copy the status of another candidate without changing policy. + void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector ScheduledSUnits; + std::vector ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set getInRegs() { + std::set InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector IsLowLatencySU; + std::vector LowLatencyOffset; + std::vector IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. + std::vector TopDownIndex2SU; + std::vector BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ Index: llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -0,0 +1,1968 @@ +//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "SIMachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +// This scheduler implements a different scheduling algorithm than +// GenericScheduler. +// +// There are several specific architecture behaviours that can't be modelled +// for GenericScheduler: +// . When accessing the result of an SGPR load instruction, you have to wait +// for all the SGPR load instructions before your current instruction to +// have finished. +// . When accessing the result of an VGPR load instruction, you have to wait +// for all the VGPR load instructions previous to the VGPR load instruction +// you are interested in to finish. +// . The less the register pressure, the best load latencies are hidden +// +// Moreover some specifities (like the fact a lot of instructions in the shader +// have few dependencies) makes the generic scheduler have some unpredictable +// behaviours. For example when register pressure becomes high, it can either +// manage to prevent register pressure from going too high, or it can +// increase register pressure even more than if it hadn't taken register +// pressure into account. +// +// Also some other bad behaviours are generated, like loading at the beginning +// of the shader a constant in VGPR you won't need until the end of the shader. +// +// The scheduling problem for SI can distinguish three main parts: +// . Hiding high latencies (texture sampling, etc) +// . Hiding low latencies (SGPR constant loading, etc) +// . Keeping register usage low for better latency hiding and general +// performance +// +// Some other things can also affect performance, but are hard to predict +// (cache usage, the fact the HW can issue several instructions from different +// wavefronts if different types, etc) +// +// This scheduler tries to solve the scheduling problem by dividing it into +// simpler sub-problems. It divides the instructions into blocks, schedules +// locally inside the blocks where it takes care of low latencies, and then +// chooses the order of the blocks by taking care of high latencies. +// Dividing the instructions into blocks helps control keeping register +// usage low. +// +// First the instructions are put into blocks. +// We want the blocks help control register usage and hide high latencies +// later. To help control register usage, we typically want all local +// computations, when for example you create a result that can be comsummed +// right away, to be contained in a block. Block inputs and outputs would +// typically be important results that are needed in several locations of +// the shader. Since we do want blocks to help hide high latencies, we want +// the instructions inside the block to have a minimal set of dependencies +// on high latencies. It will make it easy to pick blocks to hide specific +// high latencies. +// The block creation algorithm is divided into several steps, and several +// variants can be tried during the scheduling process. +// +// Second the order of the instructions inside the blocks is choosen. +// At that step we do take into account only register usage and hiding +// low latency instructions +// +// Third the block order is choosen, there we try to hide high latencies +// and keep register usage low. +// +// After the third step, a pass is done to improve the hiding of low +// latencies. +// +// Actually when talking about 'low latency' or 'high latency' it includes +// both the latency to get the cache (or global mem) data go to the register, +// and the bandwith limitations. +// Increasing the number of active wavefronts helps hide the former, but it +// doesn't solve the latter, thus why even if wavefront count is high, we have +// to try have as many instructions hiding high latencies as possible. +// The OpenCL doc says for example latency of 400 cycles for a global mem access, +// which is hidden by 10 instructions if the wavefront count is 10. + +// Some figures taken from AMD docs: +// Both texture and constant L1 caches are 4-way associative with 64 bytes +// lines. +// Constant cache is shared with 4 CUs. +// For texture sampling, the address generation unit receives 4 texture +// addresses per cycle, thus we could expect texture sampling latency to be +// equivalent to 4 instructions in the very best case (a VGPR is 64 work items, +// instructions in a wavefront group are executed every 4 cycles), +// or 16 instructions if the other wavefronts associated to the 3 other VALUs +// of the CU do texture sampling too. (Don't take these figures too seriously, +// as I'm not 100% sure of the computation) +// Data exports should get similar latency. +// For constant loading, the cache is shader with 4 CUs. +// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" +// I guess if the other CU don't read the cache, it can go up to 64B/cycle. +// It means a simple s_buffer_load should take one instruction to hide, as +// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same +// cache line. +// +// As of today the driver doesn't preload the constants in cache, thus the +// first loads get extra latency. The doc says global memory access can be +// 300-600 cycles. We do not specially take that into account when scheduling +// As we expect the driver to be able to preload the constants soon. + + +// common code // + +#ifndef NDEBUG + +static const char *getReasonStr(SIScheduleCandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND"; + case RegUsage: return "REGUSAGE"; + case Latency: return "LATENCY"; + case Successor: return "SUCCESSOR"; + case Depth: return "DEPTH"; + case NodeOrder: return "ORDER"; + } + llvm_unreachable("Unknown reason!"); +} + +#endif + +static bool tryLess(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +static bool tryGreater(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +// SIScheduleBlock // + +void SIScheduleBlock::addUnit(SUnit *SU) { + NodeNum2Index[SU->NodeNum] = SUnits.size(); + SUnits.push_back(SU); +} + +#ifndef NDEBUG + +void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { + + dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << '\n'; +} +#endif + +void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, + SISchedCandidate &TryCand) { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + if (Cand.SGPRUsage > 60 && + tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Schedule low latency instructions as top as possible. + // Order of priority is: + // . Low latency instructions which do not depend on other low latency + // instructions we haven't waited for + // . Other instructions which do not depend on low latency instructions + // we haven't waited for + // . Low latencies + // . All other instructions + // Goal is to get: low latency instructions - independant instructions + // - (eventually some more low latency instructions) + // - instructions that depend on the first low latency instructions. + // If in the block there is a lot of constant loads, the SGPR usage + // could go quite high, thus above the arbitrary limit of 60 will encourage + // use the already loaded constants (in order to release some SGPRs) before + // loading more. + if (tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (TryCand.IsLowLatency && + tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Fall through to original instruction order. + if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { + TryCand.Reason = NodeOrder; + } +} + +SUnit* SIScheduleBlock::pickNode() { + SISchedCandidate TopCand; + + for (SUnit* SU : TopReadySUs) { + SISchedCandidate TryCand; + std::vector pressure; + std::vector MaxPressure; + // Predict register usage after this instruction. + TryCand.SU = SU; + TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); + TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; + TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; + TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; + TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; + TryCand.HasLowLatencyNonWaitedParent = + HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; + tryCandidateTopDown(TopCand, TryCand); + if (TryCand.Reason != NoCand) + TopCand.setBest(TryCand); + } + + return TopCand.SU; +} + + +// Schedule something valid. +void SIScheduleBlock::fastSchedule() { + TopReadySUs.clear(); + if (Scheduled) + undoSchedule(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = TopReadySUs[0]; + ScheduledSUnits.push_back(SU); + nodeScheduled(SU); + } + + Scheduled = true; +} + +// Returns if the register was set between first and last. +static bool isDefBetween(unsigned Reg, + SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { + for (MachineRegisterInfo::def_instr_iterator + UI = MRI->def_instr_begin(Reg), + UE = MRI->def_instr_end(); UI != UE; ++UI) { + const MachineInstr* MI = &*UI; + if (MI->isDebugValue()) + continue; + SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + if (InstSlot >= First && InstSlot <= Last) + return true; + } + return false; +} + +void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + IntervalPressure Pressure, BotPressure; + RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); + LiveIntervals *LIS = DAG->getLIS(); + MachineRegisterInfo *MRI = DAG->getMRI(); + DAG->initRPTracker(TopRPTracker); + DAG->initRPTracker(BotRPTracker); + DAG->initRPTracker(RPTracker); + + // Goes though all SU. RPTracker captures what had to be alive for the SUs + // to execute, and what is still alive at the end. + for (SUnit* SU : ScheduledSUnits) { + RPTracker.setPos(SU->getInstr()); + RPTracker.advance(); + } + + // Close the RPTracker to finalize live ins/outs. + RPTracker.closeRegion(); + + // Initialize the live ins and live outs. + TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); + BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); + + // Do not Track Physical Registers, because it messes up. + for (unsigned Reg : RPTracker.getPressure().LiveInRegs) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + LiveInRegs.insert(Reg); + } + LiveOutRegs.clear(); + // There is several possibilities to distinguish: + // 1) Reg is not input to any instruction in the block, but is output of one + // 2) 1) + read in the block and not needed after it + // 3) 1) + read in the block but needed in another block + // 4) Reg is input of an instruction but another block will read it too + // 5) Reg is input of an instruction and then rewritten in the block. + // result is not read in the block (implies used in another block) + // 6) Reg is input of an instruction and then rewritten in the block. + // result is read in the block and not needed in another block + // 7) Reg is input of an instruction and then rewritten in the block. + // result is read in the block but also needed in another block + // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 + // We want LiveOutRegs to contain only Regs whose content will be read after + // in another block, and whose content was written in the current block, + // that is we want it to get 1, 3, 5, 7 + // Since we made the MIs of a block to be packed all together before + // scheduling, then the LiveIntervals were correct, and the RPTracker was + // able to correctly handle 5 vs 6, 2 vs 3. + // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) + // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 + // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 + // The use of findDefBetween removes the case 4. + for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) { + if (TargetRegisterInfo::isVirtualRegister(Reg) && + isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(), + LIS->getInstructionIndex(EndBlock).getRegSlot(), + MRI, LIS)) { + LiveOutRegs.insert(Reg); + } + } + + // Pressure = sum_alive_registers register size + // Internally llvm will represent some registers as big 128 bits registers + // for example, but they actually correspond to 4 actual 32 bits registers. + // Thus Pressure is not equal to num_alive_registers * constant. + LiveInPressure = TopPressure.MaxSetPressure; + LiveOutPressure = BotPressure.MaxSetPressure; + + // Prepares TopRPTracker for top down scheduling. + TopRPTracker.closeTop(); +} + +void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + if (!Scheduled) + fastSchedule(); + + // PreScheduling phase to set LiveIn and LiveOut. + initRegPressure(BeginBlock, EndBlock); + undoSchedule(); + + // Schedule for real now. + + TopReadySUs.clear(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = pickNode(); + ScheduledSUnits.push_back(SU); + TopRPTracker.setPos(SU->getInstr()); + TopRPTracker.advance(); + nodeScheduled(SU); + } + + // TODO: compute InternalAdditionnalPressure. + InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); + + // Check everything is right. +#ifndef NDEBUG + assert(SUnits.size() == ScheduledSUnits.size() && + TopReadySUs.empty()); + for (SUnit* SU : SUnits) { + assert(SU->isScheduled && + SU->NumPredsLeft == 0); + } +#endif + + Scheduled = true; +} + +void SIScheduleBlock::undoSchedule() { + for (SUnit* SU : SUnits) { + SU->isScheduled = false; + for (SDep& Succ : SU->Succs) { + if (BC->isSUInBlock(Succ.getSUnit(), ID)) + undoReleaseSucc(SU, &Succ); + } + } + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + ScheduledSUnits.clear(); + Scheduled = false; +} + +void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + ++SuccSU->WeakPredsLeft; + return; + } + ++SuccSU->NumPredsLeft; +} + +void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + --SuccSU->WeakPredsLeft; + return; + } +#ifndef NDEBUG + if (SuccSU->NumPredsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + SuccSU->dump(DAG); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(nullptr); + } +#endif + + --SuccSU->NumPredsLeft; +} + +/// Release Successors of the SU that are in the block or not. +void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { + for (SDep& Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) + continue; + + releaseSucc(SU, &Succ); + if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) + TopReadySUs.push_back(SuccSU); + } +} + +void SIScheduleBlock::nodeScheduled(SUnit *SU) { + // Is in TopReadySUs + assert (!SU->NumPredsLeft); + std::vector::iterator I = + std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + if (I == TopReadySUs.end()) { + dbgs() << "Data Structure Bug in SI Scheduler\n"; + llvm_unreachable(nullptr); + } + TopReadySUs.erase(I); + + releaseSuccessors(SU, true); + // Scheduling this node will trigger a wait, + // thus propagate to other instructions that they do not need to wait either. + if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + + if (DAG->IsLowLatencySU[SU->NodeNum]) { + for (SDep& Succ : SU->Succs) { + std::map::iterator I = + NodeNum2Index.find(Succ.getSUnit()->NodeNum); + if (I != NodeNum2Index.end()) + HasLowLatencyNonWaitedParent[I->second] = 1; + } + } + SU->isScheduled = true; +} + +void SIScheduleBlock::finalizeUnits() { + // We remove links from outside blocks to enable scheduling inside the block. + for (SUnit* SU : SUnits) { + releaseSuccessors(SU, false); + if (DAG->IsHighLatencySU[SU->NodeNum]) + HighLatencyBlock = true; + } + HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); +} + +// we maintain ascending order of IDs +void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { + unsigned PredID = Pred->getID(); + + // Check if not already predecessor. + for (SIScheduleBlock* P : Preds) { + if (PredID == P->getID()) + return; + } + Preds.push_back(Pred); + +#ifndef NDEBUG + for (SIScheduleBlock* S : Succs) { + if (PredID == S->getID()) + assert(!"Loop in the Block Graph!\n"); + } +#endif +} + +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { + unsigned SuccID = Succ->getID(); + + // Check if not already predecessor. + for (SIScheduleBlock* S : Succs) { + if (SuccID == S->getID()) + return; + } + if (Succ->isHighLatencyBlock()) + ++NumHighLatencySuccessors; + Succs.push_back(Succ); +#ifndef NDEBUG + for (SIScheduleBlock* P : Preds) { + if (SuccID == P->getID()) + assert("Loop in the Block Graph!\n"); + } +#endif +} + +#ifndef NDEBUG +void SIScheduleBlock::printDebug(bool full) { + dbgs() << "Block (" << ID << ")\n"; + if (!full) + return; + + dbgs() << "\nContains High Latency Instruction: " + << HighLatencyBlock << '\n'; + dbgs() << "\nDepends On:\n"; + for (SIScheduleBlock* P : Preds) { + P->printDebug(false); + } + + dbgs() << "\nSuccessors:\n"; + for (SIScheduleBlock* S : Succs) { + S->printDebug(false); + } + + if (Scheduled) { + dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' + << LiveInPressure[DAG->getVGPRSetID()] << '\n'; + dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' + << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; + dbgs() << "LiveIns:\n"; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + + dbgs() << "\nLiveOuts:\n"; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + } + + dbgs() << "\nInstructions:\n"; + if (!Scheduled) { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } else { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } + + dbgs() << "///////////////////////\n"; +} + +#endif + +// SIScheduleBlockCreator // + +SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : +DAG(DAG) { +} + +SIScheduleBlockCreator::~SIScheduleBlockCreator() { +} + +SIScheduleBlocks +SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { + std::map::iterator B = + Blocks.find(BlockVariant); + if (B == Blocks.end()) { + SIScheduleBlocks Res; + createBlocksForVariant(BlockVariant); + topologicalSort(); + scheduleInsideBlocks(); + fillStats(); + Res.Blocks = CurrentBlocks; + Res.TopDownIndex2Block = TopDownIndex2Block; + Res.TopDownBlock2Index = TopDownBlock2Index; + Blocks[BlockVariant] = Res; + return Res; + } else { + return B->second; + } +} + +bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { + if (SU->NodeNum >= DAG->SUnits.size()) + return false; + return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; +} + +void SIScheduleBlockCreator::colorHighLatenciesAlone() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + CurrentColoring[SU->NodeNum] = NextReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorHighLatenciesGroups() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned NumHighLatencies = 0; + unsigned GroupSize; + unsigned Color = NextReservedID; + unsigned Count = 0; + std::set FormingGroup; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) + ++NumHighLatencies; + } + + if (NumHighLatencies == 0) + return; + + if (NumHighLatencies <= 6) + GroupSize = 2; + else if (NumHighLatencies <= 12) + GroupSize = 3; + else + GroupSize = 4; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + unsigned CompatibleGroup = true; + unsigned ProposedColor = Color; + for (unsigned j : FormingGroup) { + // TODO: Currently CompatibleGroup will always be false, + // because the graph enforces the load order. This + // can be fixed, but as keeping the load order is often + // good for performance that causes a performance hit (both + // the default scheduler and this scheduler). + // When this scheduler determines a good load order, + // this can be fixed. + if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || + !DAG->canAddEdge(&DAG->SUnits[j], SU)) + CompatibleGroup = false; + } + if (!CompatibleGroup || ++Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + if (!CompatibleGroup) { + ProposedColor = Color; + FormingGroup.insert(SU->NodeNum); + } + Count = 0; + } else { + FormingGroup.insert(SU->NodeNum); + } + CurrentColoring[SU->NodeNum] = ProposedColor; + } + } +} + +void SIScheduleBlockCreator::colorComputeReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map, unsigned> ColorCombinations; + + CurrentTopDownReservedDependencyColoring.clear(); + CurrentBottomUpReservedDependencyColoring.clear(); + + CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); + CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); + + // Traverse TopDown, and give different colors to SUs depending + // on which combination of High Latencies they depend on. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + std::set SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) + SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); + } + // Color 0 by default. + if (SUColors.empty()) + continue; + // Same color than parents. + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } + + ColorCombinations.clear(); + + // Same as before, but BottomUp. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); + } + // Keep color 0. + if (SUColors.empty()) + continue; + // Same color than parents. + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } +} + +void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map, unsigned> ColorCombinations; + + // Every combination of colors given by the top down + // and bottom up Reserved node dependency + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + std::pair SUColors; + + // High latency instructions: already given. + if (CurrentColoring[SU->NodeNum]) + continue; + + SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + + std::map, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentColoring[SU->NodeNum] = Pos->second; + } else { + CurrentColoring[SU->NodeNum] = NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::vector PendingColoring = CurrentColoring; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set SUColors; + std::set SUColorsPending; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentColoring[Succ->NodeNum]); + SUColorsPending.insert(PendingColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && SUColorsPending.size() == 1) + PendingColoring[SU->NodeNum] = *SUColors.begin(); + else // TODO: Attribute new colors depending on color + // combination of children. + PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::set SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (ColorCount[color] > 1) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() != color) { + --ColorCount[color]; + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + ++ColorCount[*SUColors.begin()]; + } + } +} + +void SIScheduleBlockCreator::cutHugeBlocks() { + // TODO +} + +void SIScheduleBlockCreator::regroupNoUserInstructions() { + unsigned DAGSize = DAG->SUnits.size(); + int GroupID = NextNonReservedID++; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + bool hasSuccessor = false; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + hasSuccessor = true; + } + if (!hasSuccessor) + CurrentColoring[SU->NodeNum] = GroupID; + } +} + +void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { + unsigned DAGSize = DAG->SUnits.size(); + std::map RealID; + + CurrentBlocks.clear(); + CurrentColoring.clear(); + CurrentColoring.resize(DAGSize, 0); + Node2CurrentBlock.clear(); + + // Restore links previous scheduling variant has overridden. + DAG->restoreSULinksLeft(); + + NextReservedID = 1; + NextNonReservedID = DAGSize + 1; + + DEBUG(dbgs() << "Coloring the graph\n"); + + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) + colorHighLatenciesGroups(); + else + colorHighLatenciesAlone(); + colorComputeReservedDependencies(); + colorAccordingToReservedDependencies(); + colorEndsAccordingToDependencies(); + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) + colorForceConsecutiveOrderInGroup(); + regroupNoUserInstructions(); + colorMergeConstantLoadsNextGroup(); + colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Put SUs of same color into same block + Node2CurrentBlock.resize(DAGSize, -1); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned Color = CurrentColoring[SU->NodeNum]; + if (RealID.find(Color) == RealID.end()) { + int ID = CurrentBlocks.size(); + BlockPtrs.push_back( + make_unique(DAG, this, ID)); + CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); + RealID[Color] = ID; + } + CurrentBlocks[RealID[Color]]->addUnit(SU); + Node2CurrentBlock[SU->NodeNum] = RealID[Color]; + } + + // Build dependencies between blocks. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + int SUID = Node2CurrentBlock[i]; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Succ->NodeNum] != SUID) + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + } + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Pred->NodeNum] != SUID) + CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); + } + } + + // Free root and leafs of all blocks to enable scheduling inside them. + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->finalizeUnits(); + } + DEBUG( + dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +// Two functions taken from Codegen/MachineScheduler.cpp + +/// If this iterator is a debug value, increment until reaching the End or a +/// non-debug instruction. +static MachineBasicBlock::const_iterator +nextIfDebug(MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator End) { + for(; I != End; ++I) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// Non-const version. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, + MachineBasicBlock::const_iterator End) { + // Cast the return value to nonconst MachineInstr, then cast to an + // instr_iterator, which does not check for null, finally return a + // bundle_iterator. + return MachineBasicBlock::instr_iterator( + const_cast( + &*nextIfDebug(MachineBasicBlock::const_iterator(I), End))); +} + +void SIScheduleBlockCreator::topologicalSort() { + unsigned DAGSize = CurrentBlocks.size(); + std::vector WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + + WorkList.reserve(DAGSize); + TopDownIndex2Block.resize(DAGSize); + TopDownBlock2Index.resize(DAGSize); + BottomUpIndex2Block.resize(DAGSize); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + unsigned Degree = Block->getSuccs().size(); + TopDownBlock2Index[i] = Degree; + if (Degree == 0) { + WorkList.push_back(i); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + int i = WorkList.back(); + SIScheduleBlock *Block = CurrentBlocks[i]; + WorkList.pop_back(); + TopDownBlock2Index[i] = --Id; + TopDownIndex2Block[Id] = i; + for (SIScheduleBlock* Pred : Block->getPreds()) { + if (!--TopDownBlock2Index[Pred->getID()]) + WorkList.push_back(Pred->getID()); + } + } + +#ifndef NDEBUG + // Check correctness of the ordering. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock* Pred : Block->getPreds()) { + assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && + "Wrong Top Down topological sorting"); + } + } +#endif + + BottomUpIndex2Block = std::vector(TopDownIndex2Block.rbegin(), + TopDownIndex2Block.rend()); +} + +void SIScheduleBlockCreator::scheduleInsideBlocks() { + unsigned DAGSize = CurrentBlocks.size(); + + DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + + // We do schedule a valid scheduling such that a Block corresponds + // to a range of instructions. + DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->fastSchedule(); + } + + // Note: the following code, and the part restoring previous position + // is by far the most expensive operation of the Scheduler. + + // Do not update CurrentTop. + MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); + std::vector PosOld; + std::vector PosNew; + PosOld.reserve(DAG->SUnits.size()); + PosNew.reserve(DAG->SUnits.size()); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + std::vector SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) { + MachineInstr *MI = SU->getInstr(); + MachineBasicBlock::iterator Pos = MI; + PosOld.push_back(Pos); + if (&*CurrentTopFastSched == MI) { + PosNew.push_back(Pos); + CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, + DAG->getCurrentBottom()); + } else { + // Update the instruction stream. + DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); + + // Update LiveIntervals. + // Note: Moving all instructions and calling handleMove everytime + // is the most cpu intensive operation of the scheduler. + // It would gain a lot if there was a way to recompute the + // LiveIntervals for the entire scheduling region. + DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true); + PosNew.push_back(CurrentTopFastSched); + } + } + } + + // Now we have Block of SUs == Block of MI. + // We do the final schedule for the instructions inside the block. + // The property that all the SUs of the Block are grouped together as MI + // is used for correct reg usage tracking. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + std::vector SUs = Block->getScheduledUnits(); + Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); + } + + DEBUG(dbgs() << "Restoring MI Pos\n"); + // Restore old ordering (which prevents a LIS->handleMove bug). + for (unsigned i = PosOld.size(), e = 0; i != e; --i) { + MachineBasicBlock::iterator POld = PosOld[i-1]; + MachineBasicBlock::iterator PNew = PosNew[i-1]; + if (PNew != POld) { + // Update the instruction stream. + DAG->getBB()->splice(POld, DAG->getBB(), PNew); + + // Update LiveIntervals. + DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + } + } + + DEBUG( + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +void SIScheduleBlockCreator::fillStats() { + unsigned DAGSize = CurrentBlocks.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getPreds().size() == 0) + Block->Depth = 0; + else { + unsigned Depth = 0; + for (SIScheduleBlock *Pred : Block->getPreds()) { + if (Depth < Pred->Depth + 1) + Depth = Pred->Depth + 1; + } + Block->Depth = Depth; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = BottomUpIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getSuccs().size() == 0) + Block->Height = 0; + else { + unsigned Height = 0; + for (SIScheduleBlock *Succ : Block->getSuccs()) { + if (Height < Succ->Height + 1) + Height = Succ->Height + 1; + } + Block->Height = Height; + } + } +} + +// SIScheduleBlockScheduler // + +SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct) : + DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), + LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), + SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { + + // Fill the usage of every output + // Warning: while by construction we always have a link between two blocks + // when one needs a result from the other, the number of users of an output + // is not the sum of child blocks having as input the same virtual register. + // Here is an example. A produces x and y. B eats x and produces x'. + // C eats x' and y. The register coalescer may have attributed the same + // virtual register to x and x'. + // To count accurately, we do a topological sort. In case the register is + // found for several parents, we increment the usage of the one with the + // highest topological index. + LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. + if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) + return true; + if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, + TryCand, Cand, Depth)) + return true; + if (tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) + return true; + return false; +} + +bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) + return true; + if (tryGreater(TryCand.NumSuccessors > 0, + Cand.NumSuccessors > 0, + TryCand, Cand, Successor)) + return true; + if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) + return true; + if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) + return true; + return false; +} + +SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { + SIBlockSchedCandidate Cand; + std::vector::iterator Best; + SIScheduleBlock *Block; + if (ReadyBlocks.empty()) + return nullptr; + + DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), + VregCurrentUsage, SregCurrentUsage); + if (VregCurrentUsage > maxVregUsage) + maxVregUsage = VregCurrentUsage; + if (VregCurrentUsage > maxSregUsage) + maxSregUsage = VregCurrentUsage; + DEBUG( + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock* Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (unsigned Reg : LiveRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + ); + + Cand.Block = nullptr; + for (std::vector::iterator I = ReadyBlocks.begin(), + E = ReadyBlocks.end(); I != E; ++I) { + SIBlockSchedCandidate TryCand; + TryCand.Block = *I; + TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); + TryCand.VGPRUsageDiff = + checkRegUsageImpact(TryCand.Block->getInRegs(), + TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; + TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); + TryCand.NumHighLatencySuccessors = + TryCand.Block->getNumHighLatencySuccessors(); + TryCand.LastPosHighLatParentScheduled = + (unsigned int) std::max (0, + LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - + LastPosWaitedHighLatency); + TryCand.Height = TryCand.Block->Height; + // Try not to increase VGPR usage too much, else we may spill. + if (VregCurrentUsage > 120 || + Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { + if (!tryCandidateRegUsage(Cand, TryCand) && + Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) + tryCandidateLatency(Cand, TryCand); + } else { + if (!tryCandidateLatency(Cand, TryCand)) + tryCandidateRegUsage(Cand, TryCand); + } + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + Best = I; + DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' + << getReasonStr(Cand.Reason) << '\n'); + } + } + + DEBUG( + dbgs() << "Picking: " << Cand.Block->getID() << '\n'; + dbgs() << "Is a block with high latency instruction: " + << (Cand.IsHighLatency ? "yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector +SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs) { + std::vector DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique(C)) { + SITII = static_cast(TII); + SITRI = static_cast(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// This is to be replaced by a better pass that would +// take into account SGPR usage (based on VGPR Usage +// and the corresponding wavefront count), that would +// try to merge groups of loads if it make sense, etc +void SIScheduleDAGMI::moveLowLatencies() { + unsigned DAGSize = SUnits.size(); + int LastLowLatencyUser = -1; + int LastLowLatencyPos = -1; + + for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[ScheduledSUnits[i]]; + bool IsLowLatencyUser = false; + unsigned MinPos = 0; + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + IsLowLatencyUser = true; + } + if (Pred->NodeNum >= DAGSize) + continue; + unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; + if (PredPos >= MinPos) + MinPos = PredPos + 1; + } + + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BestPos = LastLowLatencyUser + 1; + if ((int)BestPos <= LastLowLatencyPos) + BestPos = LastLowLatencyPos + 1; + if (BestPos < MinPos) + BestPos = MinPos; + if (BestPos < i) { + for (unsigned u = i; u > BestPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[BestPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = BestPos; + } + LastLowLatencyPos = BestPos; + if (IsLowLatencyUser) + LastLowLatencyUser = BestPos; + } else if (IsLowLatencyUser) { + LastLowLatencyUser = i; + // Moves COPY instructions on which depends + // the low latency instructions too. + } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { + bool CopyForLowLat = false; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + CopyForLowLat = true; + } + } + if (!CopyForLowLat) + continue; + if (MinPos < i) { + for (unsigned u = i; u > MinPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[MinPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = MinPos; + } + } + } +} + +void SIScheduleDAGMI::restoreSULinksLeft() { + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnits[i].isScheduled = false; + SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; + SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; + SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; + SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; + } +} + +// Return the Vgpr and Sgpr usage corresponding to some virtual registers. +template void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. + if (Best.MaxVGPRUsage > 200) { + std::vector> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h @@ -25,6 +25,9 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + void reserveRegisterTuples(BitVector &, unsigned Reg) const; public: @@ -146,6 +149,9 @@ unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, unsigned Value, Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -23,7 +23,20 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { MCRegAliasIterator R(Reg, this, true); Index: llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s + +; The test checks the "si" machine scheduler pass works correctly. + +; CHECK-LABEL: {{^}}main: +; CHECK: s_wqm +; CHECK: s_load_dwordx4 +; CHECK: s_load_dwordx8 +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK: image_sample +; CHECK: s_waitcnt vmcnt(0) +; CHECK: exp +; CHECK: s_endpgm + +define void @main([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, +<2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +main_body: + %22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)* + %23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0 + %24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)* + %25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0 + %26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11) + %27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11) + %28 = bitcast float %26 to i32 + %29 = bitcast float %27 to i32 + %30 = insertelement <2 x i32> undef, i32 %28, i32 0 + %31 = insertelement <2 x i32> %30, i32 %29, i32 1 + %32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2) + %33 = extractelement <4 x float> %32, i32 0 + %34 = extractelement <4 x float> %32, i32 1 + %35 = extractelement <4 x float> %32, i32 2 + %36 = extractelement <4 x float> %32, i32 3 + %37 = call i32 @llvm.SI.packf16(float %33, float %34) + %38 = bitcast i32 %37 to float + %39 = call i32 @llvm.SI.packf16(float %35, float %36) + %40 = bitcast i32 %39 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %38, float %40, float %38, float %40) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1}