diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -91,6 +91,7 @@ #include "llvm/Support/ErrorHandling.h" #include #include +#include #include #include #include @@ -109,6 +110,7 @@ class MachineLoopInfo; class RegisterClassInfo; class SchedDFSResult; +class SchedHeuristic; class ScheduleHazardRecognizer; class TargetInstrInfo; class TargetPassConfig; @@ -219,6 +221,10 @@ /// This has to be enabled in combination with shouldTrackPressure(). virtual bool shouldTrackLaneMasks() const { return false; } + /// Returns true if disable heuristic that tries to fetch nodes from long + /// dependency chains first. + virtual bool disableLatencyHeuristic() const { return false; } + // If this method returns true, handling of the scheduling regions // themselves (in case of a scheduling boundary in MBB) will be done // beginning with the topmost region of MBB. @@ -237,6 +243,10 @@ /// that depend on EntrySU or ExitSU). virtual void registerRoots() {} + /// Register the Heuristic before \p Before. + virtual void registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) {} + /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to /// schedule the node at the top of the unscheduled region. Otherwise it will /// be scheduled at the bottom. @@ -355,6 +365,18 @@ /// instances of ScheduleDAGMI to perform custom DAG postprocessing. void postprocessDAG(); + /// Register all the heuristics for the scheduler. The earlier it is + /// registered, the higher priority it has. + virtual void registerHeuristics(); + + /// Wrapper to register heuristic. + void registerHeuristic(SchedHeuristic *Heuristic, + SchedHeuristic *Before = nullptr) { + if (!Heuristic) + return; + SchedImpl->registerHeuristic(*Heuristic, Before); + } + /// Release ExitSU predecessors and setup scheduler queues. void initQueues(ArrayRef TopRoots, ArrayRef BotRoots); @@ -406,6 +428,9 @@ IntervalPressure RegPressure; RegPressureTracker RPTracker; + /// Disable the heuristic for long dependency chain in this region. + bool DisableLatencyHeuristic = false; + /// List of pressure sets that exceed the target's pressure limit before /// scheduling, listed in increasing set ID order. Each pressure set is paired /// with its max pressure in the currently scheduled regions. @@ -438,6 +463,9 @@ /// Return true if register pressure tracking is enabled. bool isTrackingPressure() const { return ShouldTrackPressure; } + /// Return true if latency heuristic is disbaled. + bool disableLatencyHeuristic() const { return DisableLatencyHeuristic; } + /// Get current register pressure for the top scheduled instructions. const IntervalPressure &getTopPressure() const { return TopPressure; } const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; } @@ -502,6 +530,8 @@ /// Move an instruction and update register pressure. void scheduleMI(SUnit *SU, bool IsTopNode); + void registerHeuristics() override; + // Lesser helpers... void initRegPressure(); @@ -796,17 +826,6 @@ /// heuristics for either preRA or postRA scheduling. class GenericSchedulerBase : public MachineSchedStrategy { public: - /// Represent the type of SchedCandidate found within a single queue. - /// pickNodeBidirectional depends on these listed by decreasing priority. - enum CandReason : uint8_t { - NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak, - RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; - -#ifndef NDEBUG - static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); -#endif - /// Policy for scheduling the next instruction in the candidate's zone. struct CandPolicy { bool ReduceLatency = false; @@ -852,8 +871,8 @@ // The best SUnit candidate. SUnit *SU; - // The reason for this candidate. - CandReason Reason; + // The heuristic reason for this candidate. + SchedHeuristic *Reason = nullptr; // Whether this candidate should be scheduled at top/bottom. bool AtTop; @@ -870,7 +889,7 @@ void reset(const CandPolicy &NewPolicy) { Policy = NewPolicy; SU = nullptr; - Reason = NoCand; + Reason = nullptr; AtTop = false; RPDelta = RegPressureDelta(); ResDelta = SchedResourceDelta(); @@ -880,7 +899,7 @@ // Copy the status of another candidate without changing policy. void setBest(SchedCandidate &Best) { - assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + assert(Best.Reason && "uninitialized Sched candidate"); SU = Best.SU; Reason = Best.Reason; AtTop = Best.AtTop; @@ -892,6 +911,15 @@ const TargetSchedModel *SchedModel); }; + // Register the \p Heuristic before \p Before. If didn't specify the \Before, + // it will be registered before the last heuristic which is the NodeOrder. + void registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) override; + + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override; + protected: const MachineSchedContext *Context; const TargetSchedModel *SchedModel = nullptr; @@ -899,11 +927,21 @@ SchedRemainder Rem; + // Keep all the heuristics. Notice that, the earlier it is pushed into the + // vector, the higher priority it has. + SmallVector Heuristics; + GenericSchedulerBase(const MachineSchedContext *C) : Context(C) {} void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone); + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + bool hasHigherPriority(const SchedHeuristic *Reason1, + const SchedHeuristic *Reason2) const; + void initialize(ScheduleDAGMI *DAG) override; + void schedNode(SUnit *SU, bool IsTopNode) override; #ifndef NDEBUG void traceCandidate(const SchedCandidate &Cand); #endif @@ -914,24 +952,16 @@ }; // Utility functions used by heuristics in tryCandidate(). -bool tryLess(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason); -bool tryGreater(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason); -bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - SchedBoundary &Zone); -bool tryPressure(const PressureChange &TryP, - const PressureChange &CandP, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason, - const TargetRegisterInfo *TRI, - const MachineFunction &MF); +GenericSchedulerBase::SchedCandidate * +tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand); + +GenericSchedulerBase::SchedCandidate * +tryPressure(const PressureChange &TryP, const PressureChange &CandP, + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + const TargetRegisterInfo *TRI, const MachineFunction &MF); + unsigned getWeakLeft(const SUnit *SU, bool isTop); int biasPhysReg(const SUnit *SU, bool isTop); @@ -957,6 +987,10 @@ return RegionPolicy.ShouldTrackLaneMasks; } + bool disableLatencyHeuristic() const override { + return RegionPolicy.DisableLatencyHeuristic; + } + void initialize(ScheduleDAGMI *dag) override; SUnit *pickNode(bool &IsTopNode) override; @@ -1001,9 +1035,6 @@ const RegPressureTracker &RPTracker, RegPressureTracker &TempTracker); - virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const; - SUnit *pickNodeBidirectional(bool &IsTopNode); void pickNodeFromQueue(SchedBoundary &Zone, @@ -1031,12 +1062,6 @@ ~PostGenericScheduler() override = default; - void initPolicy(MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - unsigned NumRegionInstrs) override { - /* no configurable policy */ - } - /// PostRA scheduling does not track pressure. bool shouldTrackPressure() const override { return false; } @@ -1064,7 +1089,6 @@ } protected: - virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand); void pickNodeFromQueue(SchedCandidate &Cand); }; @@ -1089,6 +1113,78 @@ createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI); +//===----------------------------------------------------------------------===// +// Predefined scheduling heuristics +//===----------------------------------------------------------------------===// +using SchedCandidate = GenericSchedulerBase::SchedCandidate; +class SchedHeuristic { +public: + friend class GenericSchedulerBase; + using TryFn = std::function; + using InitFn = std::function; + using UpdateFn = std::function; + using TraceFn = std::function; + + SchedHeuristic(StringRef Name) : Name(Name) {} + + // Return either Can or TryCand if it wins. Otherwise, return nullptr. + SchedCandidate *tryCandidate(SchedCandidate &Can, SchedCandidate &TryCand, + SchedBoundary *Zone) { + assert(TryCandidate && "Missing TryCandidate for heuristic"); + return TryCandidate(Can, TryCand, Zone); + } + // Notify the heuristic to update the internal state after SU is scheduled. + void update(SUnit &SU, bool IsTop) { + if (Update) + Update(SU, IsTop); + } + // Initialize the heuristic whenever entering region + void initialize(ScheduleDAGMI *DAG) { + if (Initialize) + Initialize(DAG); + } + + void installTryFn(TryFn Fn) { TryCandidate = Fn; } + void installInitFn(InitFn Fn) { Initialize = Fn; } + void installUpdateFn(UpdateFn Fn) { Update = Fn; } +#ifndef NDEBUG + void installTraceFn(TraceFn Fn) { Trace = Fn; } + void trace(const SchedCandidate &Can) { + if (Trace) + Trace(Can); + } +#endif + StringRef name() const { return Name; } + +protected: + // Heuristic name + StringRef Name; + // The internal priority. + int Priority = -1; + TryFn TryCandidate = nullptr; + UpdateFn Update = nullptr; + InitFn Initialize = nullptr; +#ifndef NDEBUG + TraceFn Trace = nullptr; +#endif +}; + +SchedHeuristic *createPhysRegHeuristic(); +SchedHeuristic *createNodeOrderHeuristic(); +SchedHeuristic *createWeakHeuristic(); +SchedHeuristic *createStallHeuristic(); +SchedHeuristic *createClusterHeuristic(ScheduleDAGMI *DAG); +SchedHeuristic *createTopDepthReduceHeuristic(bool PreRA); +SchedHeuristic *createTopPathReduceHeuristic(bool PreRA); +SchedHeuristic *createBotHeightReduceHeuristic(bool PreRA); +SchedHeuristic *createBotPathReduceHeuristic(bool PreRA); +SchedHeuristic *createResourceReduceHeuristic(ScheduleDAGMI *DAG, bool PreRA); +SchedHeuristic *createResourceDemandHeuristic(ScheduleDAGMI *DAG); +SchedHeuristic *createRegExcessHeuristic(ScheduleDAGMILive *DAG); +SchedHeuristic *createRegCritMaxHeuristic(ScheduleDAGMILive *DAG); +SchedHeuristic *createRegMaxHeuristic(ScheduleDAGMILive *DAG); + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINESCHEDULER_H diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -700,6 +700,17 @@ ScheduleDAGInstrs::finishBlock(); } +void ScheduleDAGMI::registerHeuristics() { + registerHeuristic(createStallHeuristic()); + registerHeuristic(createClusterHeuristic(this)); + registerHeuristic(createResourceReduceHeuristic(this, false)); + registerHeuristic(createResourceDemandHeuristic(this)); + registerHeuristic(createTopDepthReduceHeuristic(false)); + registerHeuristic(createTopPathReduceHeuristic(false)); + registerHeuristic(createBotHeightReduceHeuristic(false)); + registerHeuristic(createBotPathReduceHeuristic(false)); +} + /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after /// crossing a scheduling boundary. [begin, end) includes all instructions in /// the region, including the boundary itself and single-instruction regions @@ -758,6 +769,8 @@ postprocessDAG(); + registerHeuristics(); + SmallVector TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -993,6 +1006,36 @@ "ShouldTrackLaneMasks requires ShouldTrackPressure"); } +void ScheduleDAGMILive::registerHeuristics() { + registerHeuristic(createPhysRegHeuristic()); + + if (isTrackingPressure()) { + registerHeuristic(createRegExcessHeuristic(this)); + registerHeuristic(createRegCritMaxHeuristic(this)); + } + + registerHeuristic(createTopDepthReduceHeuristic(true)); + registerHeuristic(createTopPathReduceHeuristic(true)); + registerHeuristic(createBotHeightReduceHeuristic(true)); + registerHeuristic(createBotPathReduceHeuristic(true)); + registerHeuristic(createStallHeuristic()); + registerHeuristic(createClusterHeuristic(this)); + registerHeuristic(createWeakHeuristic()); + + if (isTrackingPressure()) + registerHeuristic(createRegMaxHeuristic(this)); + + registerHeuristic(createResourceReduceHeuristic(this, true)); + registerHeuristic(createResourceDemandHeuristic(this)); + + if (!SchedImpl->disableLatencyHeuristic()) { + registerHeuristic(createTopDepthReduceHeuristic(false)); + registerHeuristic(createTopPathReduceHeuristic(false)); + registerHeuristic(createBotHeightReduceHeuristic(false)); + registerHeuristic(createBotPathReduceHeuristic(false)); + } +} + // Setup the register pressure trackers for the top scheduled and bottom // scheduled regions. void ScheduleDAGMILive::initRegPressure() { @@ -1206,6 +1249,8 @@ postprocessDAG(); + registerHeuristics(); + SmallVector TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -1462,7 +1507,353 @@ } } } +//===----------------------------------------------------------------------===// +// SchedHeuristic - Predefined scheduling heuristics +//===----------------------------------------------------------------------===// +namespace llvm { +// Bias PhysReg Defs and copies to their uses and defined respectively. +SchedHeuristic *createPhysRegHeuristic() { + static SchedHeuristic PhysReg("PYHS-REG"); + PhysReg.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryLess(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), Cand, TryCand); + }); + return &PhysReg; +} + +// Fall through to original instruction order. +SchedHeuristic *createNodeOrderHeuristic() { + static SchedHeuristic NodeOrder("ORDER"); + NodeOrder.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + if (Zone->isTop()) + return tryLess(TryCand.SU->NodeNum, Cand.SU->NodeNum, TryCand, Cand); + return tryLess(TryCand.SU->NodeNum, Cand.SU->NodeNum, Cand, TryCand); + }); + return &NodeOrder; +} + +// Weak edges are for clustering and other constraints. +SchedHeuristic *createWeakHeuristic() { + static SchedHeuristic Weak("WEAK"); + Weak.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand); + }); + return &Weak; +} + +// Prioritize instructions that read unbuffered resources by stall cycles. +SchedHeuristic *createStallHeuristic() { + static SchedHeuristic Stall("STALL"); + Stall.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand); + }); + return &Stall; +} + +// Keep clustered nodes together to encourage downstream peephole +// optimizations which may reduce resource requirements. +// This is a best effort to set things up for a post-RA pass. +// Optimizations like generating loads of multiple registers should ideally +// be done within the scheduler pass by combining the loads during DAG +// postprocessing. +SchedHeuristic *createClusterHeuristic(ScheduleDAGMI *DAG) { + static SchedHeuristic Cluster("CLUSTER"); + Cluster.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + return tryLess(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, Cand, TryCand); + }); + return &Cluster; +} + +// For loops that are acyclic path limited, aggressively schedule for latency. +// Within an single cycle, whenever CurrMOps > 0, allow normal heuristics to +// take precedence. +// Prefer the candidate with the lesser depth, but only if one of them has +// depth greater than the total latency scheduled so far, otherwise either of +// them could be scheduled now with no stall. +SchedHeuristic *createTopDepthReduceHeuristic(bool PreRA) { + static SchedHeuristic TopDepthReducePreRA("TOP-DEPTH"), + TopDepthReducePostRA("TOP-DEPTH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand); + }; + TopDepthReducePreRA.installTryFn(PreRAFn); + TopDepthReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getDepth(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + TopDepthReducePreRA.installTraceFn(Trace); + TopDepthReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? &TopDepthReducePreRA : &TopDepthReducePostRA; +} + +SchedHeuristic *createTopPathReduceHeuristic(bool PreRA) { + static SchedHeuristic TopPathReducePreRA("TOP-PATH"), + TopPathReducePostRA("TOP-PATH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), Cand, + TryCand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), Cand, + TryCand); + }; + + TopPathReducePreRA.installTryFn(PreRAFn); + TopPathReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getHeight(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + TopPathReducePreRA.installTraceFn(Trace); + TopPathReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? &TopPathReducePreRA : &TopPathReducePostRA; +} + +// Prefer the candidate with the lesser height, but only if one of them +// has height greater than the total latency scheduled so far, otherwise +// either of them could be scheduled now with no stall. +SchedHeuristic *createBotHeightReduceHeuristic(bool PreRA) { + static SchedHeuristic BotHeightPreRA("BOT-HEIGHT"), + BotHeightPostRA("BOT-HEIGHT"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, + Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, + Cand); + }; + + BotHeightPreRA.installTryFn(PreRAFn); + BotHeightPostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getHeight(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + BotHeightPreRA.installTraceFn(Trace); + BotHeightPostRA.installTraceFn(Trace); +#endif + return PreRA ? &BotHeightPreRA : &BotHeightPostRA; +} + +SchedHeuristic *createBotPathReduceHeuristic(bool PreRA) { + static SchedHeuristic BotPathPreRA("BOT-PATH"), BotPathPostRA("BOT-PATH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), Cand, TryCand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), Cand, TryCand); + }; + + BotPathPreRA.installTryFn(PreRAFn); + BotPathPostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getDepth(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + BotPathPreRA.installTraceFn(Trace); + BotPathPostRA.installTraceFn(Trace); +#endif + return PreRA ? &BotPathPreRA : &BotPathPostRA; +} + +// Avoid critical resource consumption and balance the schedule. +SchedHeuristic *createResourceReduceHeuristic(ScheduleDAGMI *DAG, bool PreRA) { + static SchedHeuristic ResReducePreRA("RES-REDUCE"), + ResReducePostRA("RES-REDUCE"); + auto PreRAFn = [DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + TryCand.initResourceDelta(DAG, Zone->SchedModel); + return tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand); + }; + + ResReducePreRA.installTryFn(PreRAFn); + ResReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [DAG](const SchedCandidate &Cand) { + unsigned ResIdx = Cand.Policy.ReduceResIdx; + if (ResIdx) + dbgs() << DAG->getSchedModel()->getProcResource(ResIdx)->Name; + }; + ResReducePreRA.installTraceFn(Trace); + ResReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? &ResReducePreRA : &ResReducePostRA; +} + +SchedHeuristic *createResourceDemandHeuristic(ScheduleDAGMI *DAG) { + static SchedHeuristic ResDemand("RES-DEMAND"); + ResDemand.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, Cand, TryCand); + }); +#ifndef NDEBUG + ResDemand.installTraceFn([DAG](const SchedCandidate &Cand) { + unsigned ResIdx = Cand.Policy.DemandResIdx; + if (ResIdx) + dbgs() << DAG->getSchedModel()->getProcResource(ResIdx)->Name; + }); +#endif + return &ResDemand; +} + +// Avoid exceeding the target's limit. +SchedHeuristic *createRegExcessHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegExcess("REG-EXCESS"); + RegExcess.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, + Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + auto Trace = [DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.Excess; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }; + RegExcess.installTraceFn(Trace); +#endif + return &RegExcess; +} + +// Avoid increasing the max critical pressure in the scheduled region. +SchedHeuristic *createRegCritMaxHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegCrit("REG-CRIT"); + RegCrit.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + RegCrit.installTraceFn([DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.CriticalMax; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }); +#endif + return &RegCrit; +} + +// Avoid increasing the max pressure of the entire region. +SchedHeuristic *createRegMaxHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegMax("REG-MAX"); + RegMax.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, + TryCand, Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + RegMax.installTraceFn([DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.CurrentMax; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }); +#endif + return &RegMax; +} + +} // end namespace llvm //===----------------------------------------------------------------------===// // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores. //===----------------------------------------------------------------------===// @@ -2642,162 +3033,132 @@ Policy.DemandResIdx = OtherCritIdx; } -#ifndef NDEBUG -const char *GenericSchedulerBase::getReasonStr( - GenericSchedulerBase::CandReason Reason) { - switch (Reason) { - case NoCand: return "NOCAND "; - case Only1: return "ONLY1 "; - case PhysReg: return "PHYS-REG "; - case RegExcess: return "REG-EXCESS"; - case RegCritical: return "REG-CRIT "; - case Stall: return "STALL "; - case Cluster: return "CLUSTER "; - case Weak: return "WEAK "; - case RegMax: return "REG-MAX "; - case ResourceReduce: return "RES-REDUCE"; - case ResourceDemand: return "RES-DEMAND"; - case TopDepthReduce: return "TOP-DEPTH "; - case TopPathReduce: return "TOP-PATH "; - case BotHeightReduce:return "BOT-HEIGHT"; - case BotPathReduce: return "BOT-PATH "; - case NextDefUse: return "DEF-USE "; - case NodeOrder: return "ORDER "; - }; - llvm_unreachable("Unknown reason!"); +bool GenericSchedulerBase::hasHigherPriority( + const SchedHeuristic *Reason1, const SchedHeuristic *Reason2) const { + if (!Reason1) + return false; + if (!Reason2) + return true; + assert(Reason1->Priority >= 0 && Reason2->Priority >= 0 && + "Uninitialized heuristic priority"); + return Reason1->Priority < Reason2->Priority; } -void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { - PressureChange P; - unsigned ResIdx = 0; - unsigned Latency = 0; - switch (Cand.Reason) { - default: - break; - case RegExcess: - P = Cand.RPDelta.Excess; - break; - case RegCritical: - P = Cand.RPDelta.CriticalMax; - break; - case RegMax: - P = Cand.RPDelta.CurrentMax; - break; - case ResourceReduce: - ResIdx = Cand.Policy.ReduceResIdx; - break; - case ResourceDemand: - ResIdx = Cand.Policy.DemandResIdx; - break; - case TopDepthReduce: - Latency = Cand.SU->getDepth(); - break; - case TopPathReduce: - Latency = Cand.SU->getHeight(); - break; - case BotHeightReduce: - Latency = Cand.SU->getHeight(); - break; - case BotPathReduce: - Latency = Cand.SU->getDepth(); - break; - } - dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); - if (P.isValid()) - dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) - << ":" << P.getUnitInc() << " "; - else - dbgs() << " "; - if (ResIdx) - dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; - else - dbgs() << " "; - if (Latency) - dbgs() << " " << Latency << " cycles "; - else - dbgs() << " "; - dbgs() << '\n'; +void GenericSchedulerBase::schedNode(SUnit *SU, bool IsTop) { + // Notify the heuristics to update its internal state after the SU has been + // scheduled. + for (auto *Heuristic : Heuristics) + Heuristic->update(*SU, IsTop); } -#endif -namespace llvm { -/// Return true if this heuristic determines order. -bool tryLess(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - if (TryVal < CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal > CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; - } - return false; +void GenericSchedulerBase::initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + // Clear the heuristics when entering the region as we will register all + // the heuristics basing on the region policy later. Keep the NodeOrder as the + // default heuristic. + Heuristics.clear(); + Heuristics.push_back(createNodeOrderHeuristic()); } -bool tryGreater(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - if (TryVal > CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal < CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; +void GenericSchedulerBase::registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) { + assert(!std::count(Heuristics.begin(), Heuristics.end(), &Heuristic) && + "Heuristic is registered multiple times"); + assert(Heuristics.size() > 0 && + Heuristics[Heuristics.size() - 1] == createNodeOrderHeuristic() && + "NodeOrder heuristic is missing"); + if (!Before) + Before = Heuristics[Heuristics.size() - 1]; + + auto *Pos = std::find(Heuristics.begin(), Heuristics.end(), Before); + assert(Pos != Heuristics.end() && + "The heuristic to register before hasn't been registered"); + Heuristics.insert(Pos, &Heuristic); +} + +void GenericSchedulerBase::initialize(ScheduleDAGMI *DAG) { + if (Heuristics.size()) + LLVM_DEBUG(dbgs() << "Heuristics: "); + + // Assign the priority for each heuristics and initialize the heuristic. + for (unsigned Idx = 0; Idx < Heuristics.size(); ++Idx) { + LLVM_DEBUG(dbgs() << Heuristics[Idx]->Name; + if (Idx != Heuristics.size() - 1) dbgs() << " > "); + Heuristics[Idx]->Priority = Idx; + Heuristics[Idx]->initialize(DAG); } - return false; + + if (Heuristics.size()) + LLVM_DEBUG(dbgs() << '\n'); } -bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - SchedBoundary &Zone) { - if (Zone.isTop()) { - // Prefer the candidate with the lesser depth, but only if one of them has - // depth greater than the total latency scheduled so far, otherwise either - // of them could be scheduled now with no stall. - if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) > - Zone.getScheduledLatency()) { - if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), - TryCand, Cand, GenericSchedulerBase::TopDepthReduce)) - return true; - } - if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), - TryCand, Cand, GenericSchedulerBase::TopPathReduce)) - return true; - } else { - // Prefer the candidate with the lesser height, but only if one of them has - // height greater than the total latency scheduled so far, otherwise either - // of them could be scheduled now with no stall. - if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) > - Zone.getScheduledLatency()) { - if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), - TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) - return true; +/// Apply a set of heuristics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. +/// \param Zone describes the scheduled zone that we are extending, or nullptr +// if Cand is from a different zone than TryCand. +void GenericSchedulerBase::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = createNodeOrderHeuristic(); + return; + } + + for (auto *Reason : Heuristics) { + if (SchedCandidate *SC = Reason->tryCandidate(Cand, TryCand, Zone)) { + assert((SC == &Cand || SC == &TryCand) && + "Either Cand or TryCand is expected"); + if (SC == &TryCand || hasHigherPriority(Reason, SC->Reason)) + SC->Reason = Reason; + break; } - if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), - TryCand, Cand, GenericSchedulerBase::BotPathReduce)) - return true; } - return false; } + +#ifndef NDEBUG +void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { + dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " + << (Cand.Reason ? Cand.Reason->name() : "NOCAND") << " "; + if (Cand.Reason) + Cand.Reason->trace(Cand); + + LLVM_DEBUG(dbgs() << '\n'); +} +#endif + +namespace llvm { +/// Return the candidate if this heuristic determines order. +GenericSchedulerBase::SchedCandidate * +tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand) { + if (TryVal < CandVal) + return &TryCand; + if (TryVal > CandVal) + return &Cand; + return nullptr; +} + } // end namespace llvm -static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { - LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") - << GenericSchedulerBase::getReasonStr(Reason) << '\n'); +static void tracePick(StringRef Name, bool IsTop) { + LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") << Name << '\n'); } static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { - tracePick(Cand.Reason, Cand.AtTop); + tracePick(Cand.Reason ? Cand.Reason->name() : "NOCAND", Cand.AtTop); } void GenericScheduler::initialize(ScheduleDAGMI *dag) { + GenericSchedulerBase::initialize(dag); + assert(dag->hasVRegLiveness() && "(PreRA)GenericScheduler needs vreg liveness"); DAG = static_cast(dag); @@ -2834,6 +3195,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) { + GenericSchedulerBase::initPolicy(Begin, End, NumRegionInstrs); + const MachineFunction &MF = *Begin->getMF(); const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); @@ -2947,43 +3310,40 @@ } namespace llvm { -bool tryPressure(const PressureChange &TryP, - const PressureChange &CandP, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) { +GenericSchedulerBase::SchedCandidate * +tryPressure(const PressureChange &TryP, const PressureChange &CandP, + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + const TargetRegisterInfo *TRI, const MachineFunction &MF) { // If one candidate decreases and the other increases, go with it. // Invalid candidates have UnitInc==0. - if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, - Reason)) { - return true; - } + if (GenericSchedulerBase::SchedCandidate *SC = + tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, Cand, TryCand)) + return SC; + // Do not compare the magnitude of pressure changes between top and bottom // boundary. if (Cand.AtTop != TryCand.AtTop) - return false; + return nullptr; // If both candidates affect the same set in the same boundary, go with the // smallest increase. unsigned TryPSet = TryP.getPSetOrMax(); unsigned CandPSet = CandP.getPSetOrMax(); if (TryPSet == CandPSet) { - return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, - Reason); + return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand); } - int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : - std::numeric_limits::max(); + int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) + : std::numeric_limits::max(); - int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) : - std::numeric_limits::max(); + int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) + : std::numeric_limits::max(); // If the candidates are decreasing pressure, reverse priority. if (TryP.getUnitInc() < 0) std::swap(TryRank, CandRank); - return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); + return tryLess(TryRank, CandRank, Cand, TryCand); } unsigned getWeakLeft(const SUnit *SU, bool isTop) { @@ -3071,119 +3431,6 @@ << Cand.RPDelta.Excess.getUnitInc() << "\n"); } -/// Apply a set of heuristics to a new candidate. Heuristics are currently -/// hierarchical. This may be more efficient than a graduated cost model because -/// we don't need to evaluate all aspects of the model for each node in the -/// queue. But it's really done to make the heuristics easier to debug and -/// statistically analyze. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -/// \param Zone describes the scheduled zone that we are extending, or nullptr -// if Cand is from a different zone than TryCand. -void GenericScheduler::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary *Zone) const { - // Initialize the candidate if needed. - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return; - } - - // Bias PhysReg Defs and copies to their uses and defined respectively. - if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), - biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) - return; - - // Avoid exceeding the target's limit. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, - Cand.RPDelta.Excess, - TryCand, Cand, RegExcess, TRI, - DAG->MF)) - return; - - // Avoid increasing the max critical pressure in the scheduled region. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, - Cand.RPDelta.CriticalMax, - TryCand, Cand, RegCritical, TRI, - DAG->MF)) - return; - - // We only compare a subset of features when comparing nodes between - // Top and Bottom boundary. Some properties are simply incomparable, in many - // other instances we should only override the other boundary if something - // is a clear good pick on one boundary. Skip heuristics that are more - // "tie-breaking" in nature. - bool SameBoundary = Zone != nullptr; - if (SameBoundary) { - // For loops that are acyclic path limited, aggressively schedule for - // latency. Within an single cycle, whenever CurrMOps > 0, allow normal - // heuristics to take precedence. - if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && - tryLatency(TryCand, Cand, *Zone)) - return; - - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), - Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; - } - - // Keep clustered nodes together to encourage downstream peephole - // optimizations which may reduce resource requirements. - // - // This is a best effort to set things up for a post-RA pass. Optimizations - // like generating loads of multiple registers should ideally be done within - // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, - TryCand, Cand, Cluster)) - return; - - if (SameBoundary) { - // Weak edges are for clustering and other constraints. - if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), - getWeakLeft(Cand.SU, Cand.AtTop), - TryCand, Cand, Weak)) - return; - } - - // Avoid increasing the max pressure of the entire region. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, - Cand.RPDelta.CurrentMax, - TryCand, Cand, RegMax, TRI, - DAG->MF)) - return; - - if (SameBoundary) { - // Avoid critical resource consumption and balance the schedule. - TryCand.initResourceDelta(DAG, SchedModel); - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, - TryCand, Cand, ResourceDemand)) - return; - - // Avoid serializing long latency dependence chains. - // For acyclic path limited loops, latency was already checked above. - if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && - !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) - return; - - // Fall through to original instruction order. - if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) - || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { - TryCand.Reason = NodeOrder; - } - } -} - /// Pick the best candidate from the queue. /// /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during @@ -3204,7 +3451,7 @@ // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); - if (TryCand.Reason != NoCand) { + if (TryCand.Reason) { // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(DAG, SchedModel); @@ -3220,12 +3467,12 @@ // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; - tracePick(Only1, false); + tracePick("ONLY1", false); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; - tracePick(Only1, true); + tracePick("ONLY1", true); return SU; } // Set the bottom-up policy based on the state of the current bottom zone and @@ -3243,7 +3490,7 @@ BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + assert(BotCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); #ifndef NDEBUG @@ -3263,7 +3510,7 @@ TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + assert(TopCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); #ifndef NDEBUG @@ -3281,9 +3528,9 @@ assert(BotCand.isValid()); assert(TopCand.isValid()); SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; + TopCand.Reason = nullptr; tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { + if (TopCand.Reason) { Cand.setBest(TopCand); LLVM_DEBUG(traceCandidate(Cand)); } @@ -3308,7 +3555,7 @@ CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); tracePick(TopCand); SU = TopCand.SU; } @@ -3319,7 +3566,7 @@ CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find a candidate"); + assert(BotCand.Reason && "failed to find a candidate"); tracePick(BotCand); SU = BotCand.SU; } @@ -3382,6 +3629,7 @@ if (SU->hasPhysRegDefs) reschedulePhysReg(SU, false); } + GenericSchedulerBase::schedNode(SU, IsTopNode); } /// Create the standard converging machine scheduler. This will be used as the @@ -3411,6 +3659,8 @@ //===----------------------------------------------------------------------===// void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { + GenericSchedulerBase::initialize(Dag); + DAG = Dag; SchedModel = DAG->getSchedModel(); TRI = DAG->TRI; @@ -3443,48 +3693,6 @@ } } -/// Apply a set of heuristics to a new candidate for PostRA scheduling. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) { - // Initialize the candidate if needed. - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return; - } - - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Top.getLatencyStallCycles(TryCand.SU), - Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; - - // Keep clustered nodes together. - if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), - Cand.SU == DAG->getNextClusterSucc(), - TryCand, Cand, Cluster)) - return; - - // Avoid critical resource consumption and balance the schedule. - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, - TryCand, Cand, ResourceDemand)) - return; - - // Avoid serializing long latency dependence chains. - if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) { - return; - } - - // Fall through to original instruction order. - if (TryCand.SU->NodeNum < Cand.SU->NodeNum) - TryCand.Reason = NodeOrder; -} - void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { ReadyQueue &Q = Top.Available; for (SUnit *SU : Q) { @@ -3492,8 +3700,8 @@ TryCand.SU = SU; TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); - tryCandidate(Cand, TryCand); - if (TryCand.Reason != NoCand) { + tryCandidate(Cand, TryCand, &Top); + if (TryCand.Reason) { Cand.setBest(TryCand); LLVM_DEBUG(traceCandidate(Cand)); } @@ -3510,7 +3718,7 @@ do { SU = Top.pickOnlyChoice(); if (SU) { - tracePick(Only1, true); + tracePick("ONLY1", true); } else { CandPolicy NoPolicy; SchedCandidate TopCand(NoPolicy); @@ -3518,7 +3726,7 @@ // the instructions outside the zone, including the bottom zone. setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); pickNodeFromQueue(TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); tracePick(TopCand); SU = TopCand.SU; } @@ -3537,6 +3745,8 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); Top.bumpNode(SU); + + GenericSchedulerBase::schedNode(SU, IsTopNode); } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -158,7 +158,7 @@ // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg); - if (TryCand.Reason != NoCand) { + if (TryCand.Reason) { // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); @@ -196,7 +196,7 @@ BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + assert(BotCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); #ifndef NDEBUG @@ -216,7 +216,7 @@ TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + assert(TopCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); #ifndef NDEBUG @@ -234,9 +234,9 @@ LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; + TopCand.Reason = nullptr; GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { + if (TopCand.Reason) { Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); @@ -261,7 +261,7 @@ CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; @@ -271,7 +271,7 @@ CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find a candidate"); + assert(BotCand.Reason && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h @@ -17,34 +17,27 @@ namespace llvm { -/// A MachineSchedStrategy implementation for PowerPC pre RA scheduling. -class PPCPreRASchedStrategy : public GenericScheduler { +/// A Pre RA machine scheduler implementation for PowerPC target. +class PPCPreRAMachineScheduler : public ScheduleDAGMILive { public: - PPCPreRASchedStrategy(const MachineSchedContext *C) : - GenericScheduler(C) {} + PPCPreRAMachineScheduler(MachineSchedContext *C, + std::unique_ptr S) + : ScheduleDAGMILive(C, std::move(S)) {} + protected: - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const override; -private: - bool biasAddiLoadCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone) const; + void registerHeuristics() override; }; -/// A MachineSchedStrategy implementation for PowerPC post RA scheduling. -class PPCPostRASchedStrategy : public PostGenericScheduler { +/// A Post RA machine scheduler implementation for PowerPC target. +class PPCPostRAMachineScheduler : public ScheduleDAGMI { public: - PPCPostRASchedStrategy(const MachineSchedContext *C) : - PostGenericScheduler(C) {} + PPCPostRAMachineScheduler(MachineSchedContext *C, + std::unique_ptr S, + bool RemoveKillFlags) + : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} protected: - void initialize(ScheduleDAGMI *Dag) override; - SUnit *pickNode(bool &IsTopNode) override; - void enterMBB(MachineBasicBlock *MBB) override; - void leaveMBB() override; - - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override; - bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const; + void registerHeuristics() override; }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -8,6 +8,7 @@ #include "PPCMachineScheduler.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "PPCSubtarget.h" using namespace llvm; @@ -26,94 +27,68 @@ Cand.SU->getInstr()->getOpcode() == PPC::ADDI8; } -bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone) const { - if (DisableAddiLoadHeuristic) - return false; - - SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand; - SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand; - if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) { - TryCand.Reason = Stall; - return true; - } - if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) { - TryCand.Reason = NoCand; - return true; - } - - return false; -} - -void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary *Zone) const { - GenericScheduler::tryCandidate(Cand, TryCand, Zone); - - if (!Cand.isValid() || !Zone) - return; - - // Add powerpc specific heuristic only when TryCand isn't selected or - // selected as node order. - if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) - return; +//===----------------------------------------------------------------------===// +// PowerPC specific heuristics +//===----------------------------------------------------------------------===// - // There are some benefits to schedule the ADDI before the load to hide the - // latency, as RA may create a true dependency between the load and addi. - if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) - return; +// There are some benefits to schedule the ADDI before the load to hide the +// latency, as RA may create a true dependency between the load and addi. +SchedHeuristic *createBiasAddiLoadHeuristic() { + if (DisableAddiLoadHeuristic) + return nullptr; + + static SchedHeuristic BiasAddiLoad("ADDI-LOAD"); + BiasAddiLoad.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + SchedCandidate &FirstCand = Zone->isTop() ? TryCand : Cand; + SchedCandidate &SecondCand = Zone->isTop() ? Cand : TryCand; + if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) + return &TryCand; + if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) + return &Cand; + return nullptr; + }); + return &BiasAddiLoad; } -bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) const { +// There are some benefits to schedule the ADDI as early as possible post ra +// to avoid stalled by vector instructions which take up all the hw units. +// And ADDI is usually used to post inc the loop indvar, which matters the +// performance. +SchedHeuristic *createBiasAddiHeuristic() { if (!EnableAddiHeuristic) - return false; - - if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) { - TryCand.Reason = Stall; - return true; - } - return false; + return nullptr; + + static SchedHeuristic BiasAddi("BIAS-ADDI"); + BiasAddi.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) + return &TryCand; + if (isADDIInstr(Cand) && !isADDIInstr(TryCand)) + return &Cand; + return nullptr; + }); + return &BiasAddi; } -void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) { - PostGenericScheduler::tryCandidate(Cand, TryCand); - - if (!Cand.isValid()) - return; +void PPCPreRAMachineScheduler::registerHeuristics() { + ScheduleDAGMILive::registerHeuristics(); - // Add powerpc post ra specific heuristic only when TryCand isn't selected or - // selected as node order. - if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) + const PPCSubtarget &ST = MF.getSubtarget(); + if (!ST.usePPCPreRASchedStrategy()) return; - // There are some benefits to schedule the ADDI as early as possible post ra - // to avoid stalled by vector instructions which take up all the hw units. - // And ADDI is usually used to post inc the loop indvar, which matters the - // performance. - if (biasAddiCandidate(Cand, TryCand)) - return; + registerHeuristic(createBiasAddiLoadHeuristic()); } -void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { - // Custom PPC PostRA specific behavior here. - PostGenericScheduler::enterMBB(MBB); -} +void PPCPostRAMachineScheduler::registerHeuristics() { + ScheduleDAGMI::registerHeuristics(); -void PPCPostRASchedStrategy::leaveMBB() { - // Custom PPC PostRA specific behavior here. - PostGenericScheduler::leaveMBB(); -} - -void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) { - // Custom PPC PostRA specific initialization here. - PostGenericScheduler::initialize(Dag); -} + const PPCSubtarget &ST = MF.getSubtarget(); + if (!ST.usePPCPostRASchedStrategy()) + return; -SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) { - // Custom PPC PostRA specific scheduling here. - return PostGenericScheduler::pickNode(IsTopNode); + registerHeuristic(createBiasAddiHeuristic()); } - diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -266,24 +266,19 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ? - std::make_unique(C) : - std::make_unique(C)); + new PPCPreRAMachineScheduler(C, std::make_unique(C)); // add DAG Mutations here. DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); - return DAG; } static ScheduleDAGInstrs *createPPCPostMachineScheduler( MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget(); - ScheduleDAGMI *DAG = - new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ? - std::make_unique(C) : - std::make_unique(C), true); + ScheduleDAGMI *DAG = new PPCPostRAMachineScheduler( + C, std::make_unique(C), true); // add DAG Mutations here. if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); diff --git a/llvm/test/CodeGen/PowerPC/botheightreduce.mir b/llvm/test/CodeGen/PowerPC/botheightreduce.mir --- a/llvm/test/CodeGen/PowerPC/botheightreduce.mir +++ b/llvm/test/CodeGen/PowerPC/botheightreduce.mir @@ -26,7 +26,6 @@ ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7 ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8) ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8) ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8) @@ -34,9 +33,10 @@ ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8) ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8) ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8) - ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) - ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8) + ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] + ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) + ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]] ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]] ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]] diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -116,15 +116,14 @@ ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: -; CHECK-NEXT: ldx r9, r6, r7 -; CHECK-NEXT: ld r10, 0(r6) -; CHECK-NEXT: ldx r11, r6, r5 -; CHECK-NEXT: addi r8, r6, 1 -; CHECK-NEXT: ld r6, 4(r6) -; CHECK-NEXT: mulld r9, r10, r9 -; CHECK-NEXT: mulld r9, r9, r11 -; CHECK-NEXT: maddld r3, r9, r6, r3 -; CHECK-NEXT: mr r6, r8 +; CHECK-NEXT: ldx r8, r6, r7 +; CHECK-NEXT: ld r9, 0(r6) +; CHECK-NEXT: ldx r10, r6, r5 +; CHECK-NEXT: ld r11, 4(r6) +; CHECK-NEXT: addi r6, r6, 1 +; CHECK-NEXT: mulld r8, r9, r8 +; CHECK-NEXT: mulld r8, r8, r10 +; CHECK-NEXT: maddld r3, r8, r11, r3 ; CHECK-NEXT: bdnz .LBB1_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: add r3, r3, r4 @@ -217,25 +216,24 @@ ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: ldx r12, r9, r6 -; CHECK-NEXT: ld r0, 0(r9) -; CHECK-NEXT: ldx r30, r9, r5 -; CHECK-NEXT: ldx r29, r9, r7 -; CHECK-NEXT: addi r11, r9, 1 -; CHECK-NEXT: mulld r12, r0, r12 -; CHECK-NEXT: ld r28, 4(r9) -; CHECK-NEXT: ldx r27, r9, r8 -; CHECK-NEXT: ld r26, 12(r9) -; CHECK-NEXT: ld r25, 8(r9) -; CHECK-NEXT: ldx r9, r9, r10 -; CHECK-NEXT: mulld r12, r12, r30 -; CHECK-NEXT: mulld r12, r12, r29 -; CHECK-NEXT: mulld r12, r12, r28 -; CHECK-NEXT: mulld r12, r12, r27 -; CHECK-NEXT: mulld r12, r12, r26 -; CHECK-NEXT: mulld r12, r12, r25 -; CHECK-NEXT: maddld r3, r12, r9, r3 -; CHECK-NEXT: mr r9, r11 +; CHECK-NEXT: ldx r11, r9, r6 +; CHECK-NEXT: ld r12, 0(r9) +; CHECK-NEXT: ldx r0, r9, r5 +; CHECK-NEXT: ldx r30, r9, r7 +; CHECK-NEXT: mulld r11, r12, r11 +; CHECK-NEXT: ld r29, 4(r9) +; CHECK-NEXT: ldx r28, r9, r8 +; CHECK-NEXT: ld r27, 12(r9) +; CHECK-NEXT: ld r26, 8(r9) +; CHECK-NEXT: ldx r25, r9, r10 +; CHECK-NEXT: addi r9, r9, 1 +; CHECK-NEXT: mulld r11, r11, r0 +; CHECK-NEXT: mulld r11, r11, r30 +; CHECK-NEXT: mulld r11, r11, r29 +; CHECK-NEXT: mulld r11, r11, r28 +; CHECK-NEXT: mulld r11, r11, r27 +; CHECK-NEXT: mulld r11, r11, r26 +; CHECK-NEXT: maddld r3, r11, r25, r3 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_3: @@ -624,10 +622,10 @@ ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: beq cr0, .LBB6_8 ; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: cmpldi r4, 1 ; CHECK-NEXT: li r7, 1 ; CHECK-NEXT: addi r6, r3, 4009 -; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: ld r5, .LC0@toc@l(r5) ; CHECK-NEXT: iselgt r8, r4, r7 ; CHECK-NEXT: lis r4, -21846 @@ -639,11 +637,11 @@ ; CHECK-NEXT: li r30, 1 ; CHECK-NEXT: ld r5, 0(r5) ; CHECK-NEXT: mtctr r8 -; CHECK-NEXT: li r8, -9 -; CHECK-NEXT: addi r5, r5, -1 ; CHECK-NEXT: ori r4, r4, 43691 +; CHECK-NEXT: li r8, -9 ; CHECK-NEXT: li r29, 1 ; CHECK-NEXT: li r28, 1 +; CHECK-NEXT: addi r5, r5, -1 ; CHECK-NEXT: b .LBB6_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: @@ -652,8 +650,8 @@ ; CHECK-NEXT: ld r0, -8(r6) ; CHECK-NEXT: add r29, r0, r29 ; CHECK-NEXT: .LBB6_3: -; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r29, r28 +; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r0, r30 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 @@ -802,8 +800,8 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB7_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: xxlxor f1, f1, f1 ; CHECK-NEXT: mtctr r4 ; CHECK-NEXT: li r4, -1 @@ -884,8 +882,8 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB8_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: xxlxor f1, f1, f1 ; CHECK-NEXT: mtctr r4 ; CHECK-NEXT: li r4, -1 diff --git a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll --- a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll +++ b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll @@ -374,9 +374,9 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld 12, 384(1) # 8-byte Folded Reload ; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload -; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 376(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 368(1) # 8-byte Folded Reload ; CHECK-NEXT: std 3, 0(12) diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll --- a/llvm/test/CodeGen/PowerPC/sched-addi.ll +++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll @@ -15,8 +15,8 @@ ; CHECK-P9-NEXT: ld 5, 0(5) ; CHECK-P9-NEXT: addis 6, 2, scalars@toc@ha ; CHECK-P9-NEXT: addi 6, 6, scalars@toc@l -; CHECK-P9-NEXT: addi 6, 6, 16 ; CHECK-P9-NEXT: rldicr 5, 5, 0, 58 +; CHECK-P9-NEXT: addi 6, 6, 16 ; CHECK-P9-NEXT: addi 5, 5, -32 ; CHECK-P9-NEXT: lxvdsx 0, 0, 6 ; CHECK-P9-NEXT: rldicl 5, 5, 59, 5 @@ -35,9 +35,9 @@ ; CHECK-P9-NEXT: xvmuldp 1, 1, 0 ; CHECK-P9-NEXT: xvmuldp 4, 4, 0 ; CHECK-P9-NEXT: xvmuldp 3, 3, 0 +; CHECK-P9-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NEXT: xvmuldp 5, 5, 0 ; CHECK-P9-NEXT: addi 4, 4, 256 -; CHECK-P9-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NEXT: stxv 1, 16(3) ; CHECK-P9-NEXT: stxv 2, 0(3) ; CHECK-P9-NEXT: stxv 3, 48(3) diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -22,22 +22,22 @@ ; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: lbz 5, 0(5) +; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: bdz .LBB0_6 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 7, 32 +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: cntlzw 6, 6 +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: srwi 7, 6, 5 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 8, 32 @@ -45,12 +45,12 @@ ; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: clrldi 10, 8, 32 -; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: cntlzw 9, 6 ; CHECK-NEXT: xori 6, 5, 84 +; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: lbz 5, 0(10) +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: add 4, 4, 7 ; CHECK-NEXT: srwi 7, 9, 5 ; CHECK-NEXT: bdnz .LBB0_3 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll @@ -14,17 +14,17 @@ ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: bl calloc ; CHECK-NEXT: nop +; CHECK-NEXT: clrldi 4, 30, 32 ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: addi 3, 3, -4 -; CHECK-NEXT: li 6, 1 -; CHECK-NEXT: clrldi 4, 30, 32 ; CHECK-NEXT: mtctr 4 ; CHECK-NEXT: mullw 4, 5, 5 +; CHECK-NEXT: li 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: stwu 4, 4(3) ; CHECK-NEXT: mullw 4, 6, 6 +; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/PowerPC/sms-simple.ll b/llvm/test/CodeGen/PowerPC/sms-simple.ll --- a/llvm/test/CodeGen/PowerPC/sms-simple.ll +++ b/llvm/test/CodeGen/PowerPC/sms-simple.ll @@ -9,15 +9,15 @@ define dso_local i32* @foo() local_unnamed_addr { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis r5, r2, x@toc@ha -; CHECK-NEXT: addis r6, r2, y@toc@ha +; CHECK-NEXT: addis r5, r2, y@toc@ha ; CHECK-NEXT: li r7, 340 -; CHECK-NEXT: addi r5, r5, x@toc@l -; CHECK-NEXT: addi r5, r5, -8 -; CHECK-NEXT: addi r3, r6, y@toc@l -; CHECK-NEXT: lwz r6, y@toc@l(r6) +; CHECK-NEXT: addi r3, r5, y@toc@l +; CHECK-NEXT: lwz r6, y@toc@l(r5) +; CHECK-NEXT: addis r5, r2, x@toc@ha ; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: addi r5, r5, x@toc@l ; CHECK-NEXT: addi r4, r3, -8 +; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: lwzu r7, 12(r5) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 4(r5) diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll --- a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll @@ -50,9 +50,9 @@ ; CHECK-P9-LE-NEXT: std r31, -8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: li r6, -32768 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: neg r5, r3 @@ -189,9 +189,9 @@ ; CHECK-P9-LE-NEXT: std r31, -8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r4, r4, 15 ; CHECK-P9-LE-NEXT: li r7, -4096 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r4, r4, 15 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29 ; CHECK-P9-LE-NEXT: neg r6, r4 @@ -333,10 +333,10 @@ ; CHECK-P9-LE-NEXT: std r31, -8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: lis r5, -1 -; CHECK-P9-LE-NEXT: ori r5, r5, 0 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 +; CHECK-P9-LE-NEXT: ori r5, r5, 0 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: neg r6, r3