diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -91,6 +91,7 @@ #include "llvm/Support/ErrorHandling.h" #include #include +#include #include #include #include @@ -109,6 +110,7 @@ class MachineLoopInfo; class RegisterClassInfo; class SchedDFSResult; +class SchedHeuristic; class ScheduleHazardRecognizer; class TargetInstrInfo; class TargetPassConfig; @@ -219,6 +221,10 @@ /// This has to be enabled in combination with shouldTrackPressure(). virtual bool shouldTrackLaneMasks() const { return false; } + /// Returns true to disable the heuristic that tries to fetch nodes from + /// long dependency chains first. + virtual bool disableLatencyHeuristic() const { return false; } + // If this method returns true, handling of the scheduling regions // themselves (in case of a scheduling boundary in MBB) will be done // beginning with the topmost region of MBB. @@ -237,6 +243,10 @@ /// that depend on EntrySU or ExitSU). virtual void registerRoots() {} + /// Register \p Heuristic before \p Before. + virtual void registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) {} + /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to /// schedule the node at the top of the unscheduled region. Otherwise it will /// be scheduled at the bottom. @@ -355,6 +365,18 @@ /// instances of ScheduleDAGMI to perform custom DAG postprocessing. void postprocessDAG(); + /// Register all the heuristics for the scheduler. The earlier it is + /// registered, the higher priority it has. + virtual void registerHeuristics(); + + /// Wrapper to register heuristic. + void registerHeuristic(SchedHeuristic *Heuristic, + SchedHeuristic *Before = nullptr) { + if (!Heuristic) + return; + SchedImpl->registerHeuristic(*Heuristic, Before); + } + /// Release ExitSU predecessors and setup scheduler queues. 
void initQueues(ArrayRef TopRoots, ArrayRef BotRoots); @@ -406,6 +428,9 @@ IntervalPressure RegPressure; RegPressureTracker RPTracker; + /// Disable the heuristic for long dependency chain in this region. + bool DisableLatencyHeuristic = false; + /// List of pressure sets that exceed the target's pressure limit before /// scheduling, listed in increasing set ID order. Each pressure set is paired /// with its max pressure in the currently scheduled regions. @@ -438,6 +463,9 @@ /// Return true if register pressure tracking is enabled. bool isTrackingPressure() const { return ShouldTrackPressure; } + /// Return true if latency heuristic is disabled. + bool disableLatencyHeuristic() const { return DisableLatencyHeuristic; } + /// Get current register pressure for the top scheduled instructions. const IntervalPressure &getTopPressure() const { return TopPressure; } const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; } @@ -502,6 +530,8 @@ /// Move an instruction and update register pressure. void scheduleMI(SUnit *SU, bool IsTopNode); + void registerHeuristics() override; + // Lesser helpers... void initRegPressure(); @@ -796,17 +826,6 @@ /// heuristics for either preRA or postRA scheduling. class GenericSchedulerBase : public MachineSchedStrategy { public: - /// Represent the type of SchedCandidate found within a single queue. - /// pickNodeBidirectional depends on these listed by decreasing priority. - enum CandReason : uint8_t { - NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak, - RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; - -#ifndef NDEBUG - static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); -#endif - /// Policy for scheduling the next instruction in the candidate's zone. struct CandPolicy { bool ReduceLatency = false; @@ -852,8 +871,8 @@ // The best SUnit candidate. SUnit *SU; - // The reason for this candidate. 
- CandReason Reason; + // The heuristic reason for this candidate. + SchedHeuristic *Reason = nullptr; // Whether this candidate should be scheduled at top/bottom. bool AtTop; @@ -870,7 +889,7 @@ void reset(const CandPolicy &NewPolicy) { Policy = NewPolicy; SU = nullptr; - Reason = NoCand; + Reason = nullptr; AtTop = false; RPDelta = RegPressureDelta(); ResDelta = SchedResourceDelta(); @@ -880,7 +899,7 @@ // Copy the status of another candidate without changing policy. void setBest(SchedCandidate &Best) { - assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + assert(Best.Reason && "uninitialized Sched candidate"); SU = Best.SU; Reason = Best.Reason; AtTop = Best.AtTop; @@ -892,6 +911,15 @@ const TargetSchedModel *SchedModel); }; + // Register \p Heuristic before \p Before. If \p Before is not specified, + // it will be registered before the last heuristic, which is the NodeOrder. + void registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) override; + + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override; + protected: const MachineSchedContext *Context; const TargetSchedModel *SchedModel = nullptr; @@ -899,11 +927,21 @@ SchedRemainder Rem; + // Keep all the heuristics. Note that the earlier a heuristic is pushed + // into the vector, the higher priority it has. 
+ SmallVector Heuristics; + GenericSchedulerBase(const MachineSchedContext *C) : Context(C) {} void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone); + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; + bool hasHigherPriority(const SchedHeuristic *Reason1, + const SchedHeuristic *Reason2) const; + void initialize(ScheduleDAGMI *DAG) override; + void schedNode(SUnit *SU, bool IsTopNode) override; #ifndef NDEBUG void traceCandidate(const SchedCandidate &Cand); #endif @@ -914,24 +952,16 @@ }; // Utility functions used by heuristics in tryCandidate(). -bool tryLess(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason); -bool tryGreater(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason); -bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - SchedBoundary &Zone); -bool tryPressure(const PressureChange &TryP, - const PressureChange &CandP, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason, - const TargetRegisterInfo *TRI, - const MachineFunction &MF); +GenericSchedulerBase::SchedCandidate * +tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand); + +GenericSchedulerBase::SchedCandidate * +tryPressure(const PressureChange &TryP, const PressureChange &CandP, + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + const TargetRegisterInfo *TRI, const MachineFunction &MF); + unsigned getWeakLeft(const SUnit *SU, bool isTop); int biasPhysReg(const SUnit *SU, bool isTop); @@ -957,6 +987,10 @@ return 
RegionPolicy.ShouldTrackLaneMasks; } + bool disableLatencyHeuristic() const override { + return RegionPolicy.DisableLatencyHeuristic; + } + void initialize(ScheduleDAGMI *dag) override; SUnit *pickNode(bool &IsTopNode) override; @@ -1001,9 +1035,6 @@ const RegPressureTracker &RPTracker, RegPressureTracker &TempTracker); - virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const; - SUnit *pickNodeBidirectional(bool &IsTopNode); void pickNodeFromQueue(SchedBoundary &Zone, @@ -1031,12 +1062,6 @@ ~PostGenericScheduler() override = default; - void initPolicy(MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - unsigned NumRegionInstrs) override { - /* no configurable policy */ - } - /// PostRA scheduling does not track pressure. bool shouldTrackPressure() const override { return false; } @@ -1064,7 +1089,6 @@ } protected: - virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand); void pickNodeFromQueue(SchedCandidate &Cand); }; @@ -1089,6 +1113,78 @@ createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI); +//===----------------------------------------------------------------------===// +// Predefined scheduling heuristics +//===----------------------------------------------------------------------===// +using SchedCandidate = GenericSchedulerBase::SchedCandidate; +class SchedHeuristic { +public: + friend class GenericSchedulerBase; + using TryFn = std::function; + using InitFn = std::function; + using UpdateFn = std::function; + using TraceFn = std::function; + + SchedHeuristic(StringRef Name) : Name(Name) {} + + // Return either Can or TryCand if it wins. Otherwise, return nullptr. 
+ SchedCandidate *tryCandidate(SchedCandidate &Can, SchedCandidate &TryCand, + SchedBoundary *Zone) { + assert(TryCandidate && "Missing TryCandidate for heuristic"); + return TryCandidate(Can, TryCand, Zone); + } + // Notify the heuristic to update the internal state after SU is scheduled. + void update(SUnit &SU, bool IsTop) { + if (Update) + Update(SU, IsTop); + } + // Initialize the heuristic whenever entering region + void initialize(ScheduleDAGMI *DAG) { + if (Initialize) + Initialize(DAG); + } + + void installTryFn(TryFn Fn) { TryCandidate = Fn; } + void installInitFn(InitFn Fn) { Initialize = Fn; } + void installUpdateFn(UpdateFn Fn) { Update = Fn; } +#ifndef NDEBUG + void installTraceFn(TraceFn Fn) { Trace = Fn; } + void trace(const SchedCandidate &Can) { + if (Trace) + Trace(Can); + } +#endif + StringRef name() const { return Name; } + +protected: + // Heuristic name + StringRef Name; + // The internal priority. + int Priority = -1; + TryFn TryCandidate = nullptr; + UpdateFn Update = nullptr; + InitFn Initialize = nullptr; +#ifndef NDEBUG + TraceFn Trace = nullptr; +#endif +}; + +SchedHeuristic *createPhysRegHeuristic(); +SchedHeuristic *createNodeOrderHeuristic(); +SchedHeuristic *createWeakHeuristic(); +SchedHeuristic *createStallHeuristic(); +SchedHeuristic *createClusterHeuristic(ScheduleDAGMI *DAG); +SchedHeuristic *createTopDepthReduceHeuristic(bool PreRA); +SchedHeuristic *createTopPathReduceHeuristic(bool PreRA); +SchedHeuristic *createBotHeightReduceHeuristic(bool PreRA); +SchedHeuristic *createBotPathReduceHeuristic(bool PreRA); +SchedHeuristic *createResourceReduceHeuristic(ScheduleDAGMI *DAG, bool PreRA); +SchedHeuristic *createResourceDemandHeuristic(ScheduleDAGMI *DAG); +SchedHeuristic *createRegExcessHeuristic(ScheduleDAGMILive *DAG); +SchedHeuristic *createRegCritMaxHeuristic(ScheduleDAGMILive *DAG); +SchedHeuristic *createRegMaxHeuristic(ScheduleDAGMILive *DAG); + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINESCHEDULER_H diff --git 
a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -714,6 +714,17 @@ ScheduleDAGInstrs::finishBlock(); } +void ScheduleDAGMI::registerHeuristics() { + registerHeuristic(createStallHeuristic()); + registerHeuristic(createClusterHeuristic(this)); + registerHeuristic(createResourceReduceHeuristic(this, false)); + registerHeuristic(createResourceDemandHeuristic(this)); + registerHeuristic(createTopDepthReduceHeuristic(false)); + registerHeuristic(createTopPathReduceHeuristic(false)); + registerHeuristic(createBotHeightReduceHeuristic(false)); + registerHeuristic(createBotPathReduceHeuristic(false)); +} + /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after /// crossing a scheduling boundary. [begin, end) includes all instructions in /// the region, including the boundary itself and single-instruction regions @@ -772,6 +783,8 @@ postprocessDAG(); + registerHeuristics(); + SmallVector TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -1007,6 +1020,36 @@ "ShouldTrackLaneMasks requires ShouldTrackPressure"); } +void ScheduleDAGMILive::registerHeuristics() { + registerHeuristic(createPhysRegHeuristic()); + + if (isTrackingPressure()) { + registerHeuristic(createRegExcessHeuristic(this)); + registerHeuristic(createRegCritMaxHeuristic(this)); + } + + registerHeuristic(createTopDepthReduceHeuristic(true)); + registerHeuristic(createTopPathReduceHeuristic(true)); + registerHeuristic(createBotHeightReduceHeuristic(true)); + registerHeuristic(createBotPathReduceHeuristic(true)); + registerHeuristic(createStallHeuristic()); + registerHeuristic(createClusterHeuristic(this)); + registerHeuristic(createWeakHeuristic()); + + if (isTrackingPressure()) + registerHeuristic(createRegMaxHeuristic(this)); + + registerHeuristic(createResourceReduceHeuristic(this, true)); + registerHeuristic(createResourceDemandHeuristic(this)); + + 
if (!SchedImpl->disableLatencyHeuristic()) { + registerHeuristic(createTopDepthReduceHeuristic(false)); + registerHeuristic(createTopPathReduceHeuristic(false)); + registerHeuristic(createBotHeightReduceHeuristic(false)); + registerHeuristic(createBotPathReduceHeuristic(false)); + } +} + // Setup the register pressure trackers for the top scheduled and bottom // scheduled regions. void ScheduleDAGMILive::initRegPressure() { @@ -1220,6 +1263,8 @@ postprocessDAG(); + registerHeuristics(); + SmallVector TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -1476,7 +1521,353 @@ } } } +//===----------------------------------------------------------------------===// +// SchedHeuristic - Predefined scheduling heuristics +//===----------------------------------------------------------------------===// +namespace llvm { +// Bias PhysReg Defs and copies to their uses and defined respectively. +SchedHeuristic *createPhysRegHeuristic() { + static SchedHeuristic PhysReg("PHYS-REG"); + PhysReg.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryLess(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), Cand, TryCand); + }); + return &PhysReg; +} + +// Fall through to original instruction order. +SchedHeuristic *createNodeOrderHeuristic() { + static SchedHeuristic NodeOrder("ORDER"); + NodeOrder.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + if (Zone->isTop()) + return tryLess(TryCand.SU->NodeNum, Cand.SU->NodeNum, TryCand, Cand); + return tryLess(TryCand.SU->NodeNum, Cand.SU->NodeNum, Cand, TryCand); + }); + return &NodeOrder; +} + +// Weak edges are for clustering and other constraints. 
+SchedHeuristic *createWeakHeuristic() { + static SchedHeuristic Weak("WEAK"); + Weak.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand); + }); + return &Weak; +} + +// Prioritize instructions that read unbuffered resources by stall cycles. +SchedHeuristic *createStallHeuristic() { + static SchedHeuristic Stall("STALL"); + Stall.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand); + }); + return &Stall; +} + +// Keep clustered nodes together to encourage downstream peephole +// optimizations which may reduce resource requirements. +// This is a best effort to set things up for a post-RA pass. +// Optimizations like generating loads of multiple registers should ideally +// be done within the scheduler pass by combining the loads during DAG +// postprocessing. +SchedHeuristic *createClusterHeuristic(ScheduleDAGMI *DAG) { + static SchedHeuristic Cluster("CLUSTER"); + Cluster.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + return tryLess(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, Cand, TryCand); + }); + return &Cluster; +} + +// For loops that are acyclic path limited, aggressively schedule for latency. +// Within a single cycle, whenever CurrMOps > 0, allow normal heuristics to +// take precedence. 
+// Prefer the candidate with the lesser depth, but only if one of them has +// depth greater than the total latency scheduled so far, otherwise either of +// them could be scheduled now with no stall. +SchedHeuristic *createTopDepthReduceHeuristic(bool PreRA) { + static SchedHeuristic TopDepthReducePreRA("TOP-DEPTH"), + TopDepthReducePostRA("TOP-DEPTH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand); + }; + + TopDepthReducePreRA.installTryFn(PreRAFn); + TopDepthReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getDepth(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + TopDepthReducePreRA.installTraceFn(Trace); + TopDepthReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? 
&TopDepthReducePreRA : &TopDepthReducePostRA; +} + +SchedHeuristic *createTopPathReduceHeuristic(bool PreRA) { + static SchedHeuristic TopPathReducePreRA("TOP-PATH"), + TopPathReducePostRA("TOP-PATH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), Cand, + TryCand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || !Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), Cand, + TryCand); + }; + + TopPathReducePreRA.installTryFn(PreRAFn); + TopPathReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getHeight(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + TopPathReducePreRA.installTraceFn(Trace); + TopPathReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? &TopPathReducePreRA : &TopPathReducePostRA; +} + +// Prefer the candidate with the lesser height, but only if one of them +// has height greater than the total latency scheduled so far, otherwise +// either of them could be scheduled now with no stall. 
+SchedHeuristic *createBotHeightReduceHeuristic(bool PreRA) { + static SchedHeuristic BotHeightPreRA("BOT-HEIGHT"), + BotHeightPostRA("BOT-HEIGHT"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, + Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) <= + Zone->getScheduledLatency()) + return nullptr; + return tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, + Cand); + }; + + BotHeightPreRA.installTryFn(PreRAFn); + BotHeightPostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getHeight(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + BotHeightPreRA.installTraceFn(Trace); + BotHeightPostRA.installTraceFn(Trace); +#endif + return PreRA ? 
&BotHeightPreRA : &BotHeightPostRA; +} + +SchedHeuristic *createBotPathReduceHeuristic(bool PreRA) { + static SchedHeuristic BotPathPreRA("BOT-PATH"), BotPathPostRA("BOT-PATH"); + auto PreRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!Zone->Rem->IsAcyclicLatencyLimited || Zone->getCurrMOps()) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), Cand, TryCand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone || Zone->isTop()) + return nullptr; + if (!TryCand.Policy.ReduceLatency || Zone->Rem->IsAcyclicLatencyLimited) + return nullptr; + return tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), Cand, TryCand); + }; + + BotPathPreRA.installTryFn(PreRAFn); + BotPathPostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [](const SchedCandidate &Cand) { + unsigned Latency = Cand.SU->getDepth(); + if (Latency) + dbgs() << Latency << " cycles "; + }; + BotPathPreRA.installTraceFn(Trace); + BotPathPostRA.installTraceFn(Trace); +#endif + return PreRA ? &BotPathPreRA : &BotPathPostRA; +} + +// Avoid critical resource consumption and balance the schedule. 
+SchedHeuristic *createResourceReduceHeuristic(ScheduleDAGMI *DAG, bool PreRA) { + static SchedHeuristic ResReducePreRA("RES-REDUCE"), + ResReducePostRA("RES-REDUCE"); + auto PreRAFn = [DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + TryCand.initResourceDelta(DAG, Zone->SchedModel); + return tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand); + }; + auto PostRAFn = [](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand); + }; + + ResReducePreRA.installTryFn(PreRAFn); + ResReducePostRA.installTryFn(PostRAFn); +#ifndef NDEBUG + auto Trace = [DAG](const SchedCandidate &Cand) { + unsigned ResIdx = Cand.Policy.ReduceResIdx; + if (ResIdx) + dbgs() << DAG->getSchedModel()->getProcResource(ResIdx)->Name; + }; + ResReducePreRA.installTraceFn(Trace); + ResReducePostRA.installTraceFn(Trace); +#endif + return PreRA ? &ResReducePreRA : &ResReducePostRA; +} + +SchedHeuristic *createResourceDemandHeuristic(ScheduleDAGMI *DAG) { + static SchedHeuristic ResDemand("RES-DEMAND"); + ResDemand.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + return tryLess(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, Cand, TryCand); + }); +#ifndef NDEBUG + ResDemand.installTraceFn([DAG](const SchedCandidate &Cand) { + unsigned ResIdx = Cand.Policy.DemandResIdx; + if (ResIdx) + dbgs() << DAG->getSchedModel()->getProcResource(ResIdx)->Name; + }); +#endif + return &ResDemand; +} + +// Avoid exceeding the target's limit. 
+SchedHeuristic *createRegExcessHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegExcess("REG-EXCESS"); + RegExcess.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, + Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + auto Trace = [DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.Excess; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }; + RegExcess.installTraceFn(Trace); +#endif + return &RegExcess; +} +// Avoid increasing the max critical pressure in the scheduled region. +SchedHeuristic *createRegCritMaxHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegCrit("REG-CRIT"); + RegCrit.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + RegCrit.installTraceFn([DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.CriticalMax; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }); +#endif + return &RegCrit; +} + +// Avoid increasing the max pressure of the entire region. 
+SchedHeuristic *createRegMaxHeuristic(ScheduleDAGMILive *DAG) { + static SchedHeuristic RegMax("REG-MAX"); + RegMax.installTryFn([DAG](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + return tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, + TryCand, Cand, DAG->TRI, DAG->MF); + }); +#ifndef NDEBUG + RegMax.installTraceFn([DAG](const SchedCandidate &Cand) { + PressureChange P = Cand.RPDelta.CurrentMax; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc(); + }); +#endif + return &RegMax; +} + +} // end namespace llvm //===----------------------------------------------------------------------===// // BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores. //===----------------------------------------------------------------------===// @@ -2700,162 +3091,132 @@ Policy.DemandResIdx = OtherCritIdx; } -#ifndef NDEBUG -const char *GenericSchedulerBase::getReasonStr( - GenericSchedulerBase::CandReason Reason) { - switch (Reason) { - case NoCand: return "NOCAND "; - case Only1: return "ONLY1 "; - case PhysReg: return "PHYS-REG "; - case RegExcess: return "REG-EXCESS"; - case RegCritical: return "REG-CRIT "; - case Stall: return "STALL "; - case Cluster: return "CLUSTER "; - case Weak: return "WEAK "; - case RegMax: return "REG-MAX "; - case ResourceReduce: return "RES-REDUCE"; - case ResourceDemand: return "RES-DEMAND"; - case TopDepthReduce: return "TOP-DEPTH "; - case TopPathReduce: return "TOP-PATH "; - case BotHeightReduce:return "BOT-HEIGHT"; - case BotPathReduce: return "BOT-PATH "; - case NextDefUse: return "DEF-USE "; - case NodeOrder: return "ORDER "; - }; - llvm_unreachable("Unknown reason!"); +bool GenericSchedulerBase::hasHigherPriority( + const SchedHeuristic *Reason1, const SchedHeuristic *Reason2) const { + if (!Reason1) + return false; + if (!Reason2) + return true; + assert(Reason1->Priority >= 0 && Reason2->Priority >= 0 && + 
"Uninitialized heuristic priority"); + return Reason1->Priority < Reason2->Priority; } -void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { - PressureChange P; - unsigned ResIdx = 0; - unsigned Latency = 0; - switch (Cand.Reason) { - default: - break; - case RegExcess: - P = Cand.RPDelta.Excess; - break; - case RegCritical: - P = Cand.RPDelta.CriticalMax; - break; - case RegMax: - P = Cand.RPDelta.CurrentMax; - break; - case ResourceReduce: - ResIdx = Cand.Policy.ReduceResIdx; - break; - case ResourceDemand: - ResIdx = Cand.Policy.DemandResIdx; - break; - case TopDepthReduce: - Latency = Cand.SU->getDepth(); - break; - case TopPathReduce: - Latency = Cand.SU->getHeight(); - break; - case BotHeightReduce: - Latency = Cand.SU->getHeight(); - break; - case BotPathReduce: - Latency = Cand.SU->getDepth(); - break; - } - dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); - if (P.isValid()) - dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) - << ":" << P.getUnitInc() << " "; - else - dbgs() << " "; - if (ResIdx) - dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " "; - else - dbgs() << " "; - if (Latency) - dbgs() << " " << Latency << " cycles "; - else - dbgs() << " "; - dbgs() << '\n'; +void GenericSchedulerBase::schedNode(SUnit *SU, bool IsTop) { + // Notify the heuristics to update its internal state after the SU has been + // scheduled. + for (auto *Heuristic : Heuristics) + Heuristic->update(*SU, IsTop); } -#endif -namespace llvm { -/// Return true if this heuristic determines order. 
-bool tryLess(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - if (TryVal < CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal > CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; - } - return false; +void GenericSchedulerBase::initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + // Clear the heuristics when entering the region as we will register all + // the heuristics based on the region policy later. Keep the NodeOrder as the + // default heuristic. + Heuristics.clear(); + Heuristics.push_back(createNodeOrderHeuristic()); } -bool tryGreater(int TryVal, int CandVal, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - if (TryVal > CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal < CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; +void GenericSchedulerBase::registerHeuristic(SchedHeuristic &Heuristic, + SchedHeuristic *Before) { + assert(!std::count(Heuristics.begin(), Heuristics.end(), &Heuristic) && + "Heuristic is registered multiple times"); + assert(Heuristics.size() > 0 && + Heuristics[Heuristics.size() - 1] == createNodeOrderHeuristic() && + "NodeOrder heuristic is missing"); + if (!Before) + Before = Heuristics[Heuristics.size() - 1]; + + auto *Pos = std::find(Heuristics.begin(), Heuristics.end(), Before); + assert(Pos != Heuristics.end() && + "The heuristic to register before hasn't been registered"); + Heuristics.insert(Pos, &Heuristic); +} + +void GenericSchedulerBase::initialize(ScheduleDAGMI *DAG) { + if (Heuristics.size()) + LLVM_DEBUG(dbgs() << "Heuristics: "); + + // Assign the priority for each heuristic and initialize the heuristic. 
+ for (unsigned Idx = 0; Idx < Heuristics.size(); ++Idx) { + LLVM_DEBUG(dbgs() << Heuristics[Idx]->Name; + if (Idx != Heuristics.size() - 1) dbgs() << " > "); + Heuristics[Idx]->Priority = Idx; + Heuristics[Idx]->initialize(DAG); } - return false; + + if (Heuristics.size()) + LLVM_DEBUG(dbgs() << '\n'); } -bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - SchedBoundary &Zone) { - if (Zone.isTop()) { - // Prefer the candidate with the lesser depth, but only if one of them has - // depth greater than the total latency scheduled so far, otherwise either - // of them could be scheduled now with no stall. - if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) > - Zone.getScheduledLatency()) { - if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), - TryCand, Cand, GenericSchedulerBase::TopDepthReduce)) - return true; - } - if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), - TryCand, Cand, GenericSchedulerBase::TopPathReduce)) - return true; - } else { - // Prefer the candidate with the lesser height, but only if one of them has - // height greater than the total latency scheduled so far, otherwise either - // of them could be scheduled now with no stall. - if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) > - Zone.getScheduledLatency()) { - if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), - TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) - return true; +/// Apply a set of heuristics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 
+/// \param Zone describes the scheduled zone that we are extending, or nullptr +// if Cand is from a different zone than TryCand. +void GenericSchedulerBase::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = createNodeOrderHeuristic(); + return; + } + + for (auto *Reason : Heuristics) { + if (SchedCandidate *SC = Reason->tryCandidate(Cand, TryCand, Zone)) { + assert((SC == &Cand || SC == &TryCand) && + "Either Cand or TryCand is expected"); + if (SC == &TryCand || hasHigherPriority(Reason, SC->Reason)) + SC->Reason = Reason; + break; } - if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(), - TryCand, Cand, GenericSchedulerBase::BotPathReduce)) - return true; } - return false; } + +#ifndef NDEBUG +void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { + dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " + << (Cand.Reason ? Cand.Reason->name() : "NOCAND") << " "; + if (Cand.Reason) + Cand.Reason->trace(Cand); + + LLVM_DEBUG(dbgs() << '\n'); +} +#endif + +namespace llvm { +/// Return the candidate if this heuristic determines order. +GenericSchedulerBase::SchedCandidate * +tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand) { + if (TryVal < CandVal) + return &TryCand; + if (TryVal > CandVal) + return &Cand; + return nullptr; +} + } // end namespace llvm -static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { - LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") - << GenericSchedulerBase::getReasonStr(Reason) << '\n'); +static void tracePick(StringRef Name, bool IsTop) { + LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") << Name << '\n'); } static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { - tracePick(Cand.Reason, Cand.AtTop); + tracePick(Cand.Reason ? 
Cand.Reason->name() : "NOCAND", Cand.AtTop); } void GenericScheduler::initialize(ScheduleDAGMI *dag) { + GenericSchedulerBase::initialize(dag); + assert(dag->hasVRegLiveness() && "(PreRA)GenericScheduler needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive *>(dag); @@ -2892,6 +3253,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) { + GenericSchedulerBase::initPolicy(Begin, End, NumRegionInstrs); + const MachineFunction &MF = *Begin->getMF(); const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); @@ -3005,43 +3368,40 @@ } namespace llvm { -bool tryPressure(const PressureChange &TryP, - const PressureChange &CandP, - GenericSchedulerBase::SchedCandidate &TryCand, - GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason, - const TargetRegisterInfo *TRI, - const MachineFunction &MF) { +GenericSchedulerBase::SchedCandidate * +tryPressure(const PressureChange &TryP, const PressureChange &CandP, + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + const TargetRegisterInfo *TRI, const MachineFunction &MF) { // If one candidate decreases and the other increases, go with it. // Invalid candidates have UnitInc==0. - if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, - Reason)) { - return true; - } + if (GenericSchedulerBase::SchedCandidate *SC = + tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, Cand, TryCand)) + return SC; + // Do not compare the magnitude of pressure changes between top and bottom // boundary. if (Cand.AtTop != TryCand.AtTop) - return false; + return nullptr; // If both candidates affect the same set in the same boundary, go with the // smallest increase.
unsigned TryPSet = TryP.getPSetOrMax(); unsigned CandPSet = CandP.getPSetOrMax(); if (TryPSet == CandPSet) { - return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, - Reason); + return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand); } - int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : - std::numeric_limits<int>::max(); + int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) + : std::numeric_limits<int>::max(); - int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) : - std::numeric_limits<int>::max(); + int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) + : std::numeric_limits<int>::max(); // If the candidates are decreasing pressure, reverse priority. if (TryP.getUnitInc() < 0) std::swap(TryRank, CandRank); - return tryGreater(TryRank, CandRank, TryCand, Cand, Reason); + return tryLess(TryRank, CandRank, Cand, TryCand); } unsigned getWeakLeft(const SUnit *SU, bool isTop) { @@ -3129,119 +3489,6 @@ << Cand.RPDelta.Excess.getUnitInc() << "\n"); } -/// Apply a set of heuristics to a new candidate. Heuristics are currently -/// hierarchical. This may be more efficient than a graduated cost model because -/// we don't need to evaluate all aspects of the model for each node in the -/// queue. But it's really done to make the heuristics easier to debug and -/// statistically analyze. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -/// \param Zone describes the scheduled zone that we are extending, or nullptr -// if Cand is from a different zone than TryCand. -void GenericScheduler::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary *Zone) const { - // Initialize the candidate if needed. - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return; - } - - // Bias PhysReg Defs and copies to their uses and defined respectively.
- if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), - biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) - return; - - // Avoid exceeding the target's limit. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, - Cand.RPDelta.Excess, - TryCand, Cand, RegExcess, TRI, - DAG->MF)) - return; - - // Avoid increasing the max critical pressure in the scheduled region. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, - Cand.RPDelta.CriticalMax, - TryCand, Cand, RegCritical, TRI, - DAG->MF)) - return; - - // We only compare a subset of features when comparing nodes between - // Top and Bottom boundary. Some properties are simply incomparable, in many - // other instances we should only override the other boundary if something - // is a clear good pick on one boundary. Skip heuristics that are more - // "tie-breaking" in nature. - bool SameBoundary = Zone != nullptr; - if (SameBoundary) { - // For loops that are acyclic path limited, aggressively schedule for - // latency. Within an single cycle, whenever CurrMOps > 0, allow normal - // heuristics to take precedence. - if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && - tryLatency(TryCand, Cand, *Zone)) - return; - - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), - Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; - } - - // Keep clustered nodes together to encourage downstream peephole - // optimizations which may reduce resource requirements. - // - // This is a best effort to set things up for a post-RA pass. Optimizations - // like generating loads of multiple registers should ideally be done within - // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *CandNextClusterSU = - Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - const SUnit *TryCandNextClusterSU = - TryCand.AtTop ? 
DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == TryCandNextClusterSU, - Cand.SU == CandNextClusterSU, - TryCand, Cand, Cluster)) - return; - - if (SameBoundary) { - // Weak edges are for clustering and other constraints. - if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), - getWeakLeft(Cand.SU, Cand.AtTop), - TryCand, Cand, Weak)) - return; - } - - // Avoid increasing the max pressure of the entire region. - if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, - Cand.RPDelta.CurrentMax, - TryCand, Cand, RegMax, TRI, - DAG->MF)) - return; - - if (SameBoundary) { - // Avoid critical resource consumption and balance the schedule. - TryCand.initResourceDelta(DAG, SchedModel); - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, - TryCand, Cand, ResourceDemand)) - return; - - // Avoid serializing long latency dependence chains. - // For acyclic path limited loops, latency was already checked above. - if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && - !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) - return; - - // Fall through to original instruction order. - if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) - || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { - TryCand.Reason = NodeOrder; - } - } -} - /// Pick the best candidate from the queue. /// /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during @@ -3262,7 +3509,7 @@ // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); - if (TryCand.Reason != NoCand) { + if (TryCand.Reason) { // Initialize resource delta if needed in case future heuristics query it. 
if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(DAG, SchedModel); @@ -3278,12 +3525,12 @@ // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; - tracePick(Only1, false); + tracePick("ONLY1", false); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; - tracePick(Only1, true); + tracePick("ONLY1", true); return SU; } // Set the bottom-up policy based on the state of the current bottom zone and @@ -3301,7 +3548,7 @@ BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + assert(BotCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); #ifndef NDEBUG @@ -3321,7 +3568,7 @@ TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + assert(TopCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); #ifndef NDEBUG @@ -3339,9 +3586,9 @@ assert(BotCand.isValid()); assert(TopCand.isValid()); SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; + TopCand.Reason = nullptr; tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { + if (TopCand.Reason) { Cand.setBest(TopCand); LLVM_DEBUG(traceCandidate(Cand)); } @@ -3366,7 +3613,7 @@ CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); tracePick(TopCand); SU = TopCand.SU; } @@ -3377,7 +3624,7 @@ CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason 
!= NoCand && "failed to find a candidate"); + assert(BotCand.Reason && "failed to find a candidate"); tracePick(BotCand); SU = BotCand.SU; } @@ -3440,6 +3687,7 @@ if (SU->hasPhysRegDefs) reschedulePhysReg(SU, false); } + GenericSchedulerBase::schedNode(SU, IsTopNode); } /// Create the standard converging machine scheduler. This will be used as the @@ -3469,6 +3717,8 @@ //===----------------------------------------------------------------------===// void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { + GenericSchedulerBase::initialize(Dag); + DAG = Dag; SchedModel = DAG->getSchedModel(); TRI = DAG->TRI; @@ -3501,48 +3751,6 @@ } } -/// Apply a set of heuristics to a new candidate for PostRA scheduling. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) { - // Initialize the candidate if needed. - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return; - } - - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Top.getLatencyStallCycles(TryCand.SU), - Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; - - // Keep clustered nodes together. - if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), - Cand.SU == DAG->getNextClusterSucc(), - TryCand, Cand, Cluster)) - return; - - // Avoid critical resource consumption and balance the schedule. - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, - TryCand, Cand, ResourceDemand)) - return; - - // Avoid serializing long latency dependence chains. - if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) { - return; - } - - // Fall through to original instruction order. 
- if (TryCand.SU->NodeNum < Cand.SU->NodeNum) - TryCand.Reason = NodeOrder; -} - void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { ReadyQueue &Q = Top.Available; for (SUnit *SU : Q) { @@ -3550,8 +3758,8 @@ TryCand.SU = SU; TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); - tryCandidate(Cand, TryCand); - if (TryCand.Reason != NoCand) { + tryCandidate(Cand, TryCand, &Top); + if (TryCand.Reason) { Cand.setBest(TryCand); LLVM_DEBUG(traceCandidate(Cand)); } @@ -3568,7 +3776,7 @@ do { SU = Top.pickOnlyChoice(); if (SU) { - tracePick(Only1, true); + tracePick("ONLY1", true); } else { CandPolicy NoPolicy; SchedCandidate TopCand(NoPolicy); @@ -3576,7 +3784,7 @@ // the instructions outside the zone, including the bottom zone. setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); pickNodeFromQueue(TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); tracePick(TopCand); SU = TopCand.SU; } @@ -3595,6 +3803,8 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); Top.bumpNode(SU); + + GenericSchedulerBase::schedNode(SU, IsTopNode); } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -153,7 +153,7 @@ // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg); - if (TryCand.Reason != NoCand) { + if (TryCand.Reason) { // Initialize resource delta if needed in case future heuristics query it. 
if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); @@ -191,7 +191,7 @@ BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + assert(BotCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); #ifndef NDEBUG @@ -211,7 +211,7 @@ TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + assert(TopCand.Reason && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); #ifndef NDEBUG @@ -229,9 +229,9 @@ LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); SchedCandidate Cand = BotCand; - TopCand.Reason = NoCand; + TopCand.Reason = nullptr; GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { + if (TopCand.Reason) { Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); @@ -256,7 +256,7 @@ CandPolicy NoPolicy; TopCand.reset(NoPolicy); pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); + assert(TopCand.Reason && "failed to find a candidate"); SU = TopCand.SU; } IsTopNode = true; @@ -266,7 +266,7 @@ CandPolicy NoPolicy; BotCand.reset(NoPolicy); pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find a candidate"); + assert(BotCand.Reason && "failed to find a candidate"); SU = BotCand.SU; } IsTopNode = false; diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h +++ 
b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h @@ -17,34 +17,27 @@ namespace llvm { -/// A MachineSchedStrategy implementation for PowerPC pre RA scheduling. -class PPCPreRASchedStrategy : public GenericScheduler { +/// A Pre RA machine scheduler implementation for PowerPC target. +class PPCPreRAMachineScheduler : public ScheduleDAGMILive { public: - PPCPreRASchedStrategy(const MachineSchedContext *C) : - GenericScheduler(C) {} + PPCPreRAMachineScheduler(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S) + : ScheduleDAGMILive(C, std::move(S)) {} + protected: - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const override; -private: - bool biasAddiLoadCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone) const; + void registerHeuristics() override; }; -/// A MachineSchedStrategy implementation for PowerPC post RA scheduling. -class PPCPostRASchedStrategy : public PostGenericScheduler { +/// A Post RA machine scheduler implementation for PowerPC target.
+class PPCPostRAMachineScheduler : public ScheduleDAGMI { public: - PPCPostRASchedStrategy(const MachineSchedContext *C) : - PostGenericScheduler(C) {} + PPCPostRAMachineScheduler(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S, + bool RemoveKillFlags) + : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} protected: - void initialize(ScheduleDAGMI *Dag) override; - SUnit *pickNode(bool &IsTopNode) override; - void enterMBB(MachineBasicBlock *MBB) override; - void leaveMBB() override; - - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override; - bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const; + void registerHeuristics() override; }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -8,6 +8,7 @@ #include "PPCMachineScheduler.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "PPCSubtarget.h" using namespace llvm; @@ -26,94 +27,68 @@ Cand.SU->getInstr()->getOpcode() == PPC::ADDI8; } -bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone) const { - if (DisableAddiLoadHeuristic) - return false; - - SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand; - SchedCandidate &SecondCand = Zone.isTop() ?
Cand : TryCand; - if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) { - TryCand.Reason = Stall; - return true; - } - if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) { - TryCand.Reason = NoCand; - return true; - } - - return false; -} - -void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary *Zone) const { - GenericScheduler::tryCandidate(Cand, TryCand, Zone); - - if (!Cand.isValid() || !Zone) - return; - - // Add powerpc specific heuristic only when TryCand isn't selected or - // selected as node order. - if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) - return; +//===----------------------------------------------------------------------===// +// PowerPC specific heuristics +//===----------------------------------------------------------------------===// - // There are some benefits to schedule the ADDI before the load to hide the - // latency, as RA may create a true dependency between the load and addi. - if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) - return; +// There are some benefits to schedule the ADDI before the load to hide the +// latency, as RA may create a true dependency between the load and addi. +SchedHeuristic *createBiasAddiLoadHeuristic() { + if (DisableAddiLoadHeuristic) + return nullptr; + + static SchedHeuristic BiasAddiLoad("ADDI-LOAD"); + BiasAddiLoad.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + SchedCandidate &FirstCand = Zone->isTop() ? TryCand : Cand; + SchedCandidate &SecondCand = Zone->isTop() ? 
Cand : TryCand; + if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) + return &TryCand; + if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) + return &Cand; + return nullptr; + }); + return &BiasAddiLoad; } -bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) const { +// There are some benefits to schedule the ADDI as early as possible post ra +// to avoid stalled by vector instructions which take up all the hw units. +// And ADDI is usually used to post inc the loop indvar, which matters the +// performance. +SchedHeuristic *createBiasAddiHeuristic() { if (!EnableAddiHeuristic) - return false; - - if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) { - TryCand.Reason = Stall; - return true; - } - return false; + return nullptr; + + static SchedHeuristic BiasAddi("BIAS-ADDI"); + BiasAddi.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) + return &TryCand; + if (isADDIInstr(Cand) && !isADDIInstr(TryCand)) + return &Cand; + return nullptr; + }); + return &BiasAddi; } -void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand) { - PostGenericScheduler::tryCandidate(Cand, TryCand); - - if (!Cand.isValid()) - return; +void PPCPreRAMachineScheduler::registerHeuristics() { + ScheduleDAGMILive::registerHeuristics(); - // Add powerpc post ra specific heuristic only when TryCand isn't selected or - // selected as node order. - if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) + const PPCSubtarget &ST = MF.getSubtarget<PPCSubtarget>(); + if (!ST.usePPCPreRASchedStrategy()) return; - // There are some benefits to schedule the ADDI before the load to hide the - // latency, as RA may create a true dependency between the load and addi. - if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) - return; +//===----------------------------------------------------------------------===// +// PowerPC specific heuristics +//===----------------------------------------------------------------------===// - // There are some benefits to schedule the ADDI before the load to hide the - // latency, as RA may create a true dependency between the load and addi. +SchedHeuristic *createBiasAddiLoadHeuristic() { + if (DisableAddiLoadHeuristic) + return nullptr; + + static SchedHeuristic BiasAddiLoad("ADDI-LOAD"); + BiasAddiLoad.installTryFn([](SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) -> SchedCandidate * { + if (!Zone) + return nullptr; + SchedCandidate &FirstCand = Zone->isTop() ? TryCand : Cand; + SchedCandidate &SecondCand = Zone->isTop() ?
- if (biasAddiCandidate(Cand, TryCand)) - return; + registerHeuristic(createBiasAddiLoadHeuristic()); } -void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { - // Custom PPC PostRA specific behavior here. - PostGenericScheduler::enterMBB(MBB); -} +void PPCPostRAMachineScheduler::registerHeuristics() { + ScheduleDAGMI::registerHeuristics(); -void PPCPostRASchedStrategy::leaveMBB() { - // Custom PPC PostRA specific behavior here. - PostGenericScheduler::leaveMBB(); -} - -void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) { - // Custom PPC PostRA specific initialization here. - PostGenericScheduler::initialize(Dag); -} + const PPCSubtarget &ST = MF.getSubtarget<PPCSubtarget>(); + if (!ST.usePPCPostRASchedStrategy()) return; -SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) { - // Custom PPC PostRA specific scheduling here. - return PostGenericScheduler::pickNode(IsTopNode); + registerHeuristic(createBiasAddiHeuristic()); } - diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -268,9 +268,7 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ? - std::make_unique<PPCPreRASchedStrategy>(C) : - std::make_unique<GenericScheduler>(C)); + new PPCPreRAMachineScheduler(C, std::make_unique<GenericScheduler>(C)); // add DAG Mutations here. DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasStoreFusion()) @@ -284,10 +282,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler( MachineSchedContext *C) { const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>(); - ScheduleDAGMI *DAG = - new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ?
- std::make_unique<PPCPostRASchedStrategy>(C) : - std::make_unique<PostGenericScheduler>(C), true); + ScheduleDAGMI *DAG = new PPCPostRAMachineScheduler( + C, std::make_unique<PostGenericScheduler>(C), true); // add DAG Mutations here. if (ST.hasStoreFusion()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); diff --git a/llvm/test/CodeGen/PowerPC/botheightreduce.mir b/llvm/test/CodeGen/PowerPC/botheightreduce.mir --- a/llvm/test/CodeGen/PowerPC/botheightreduce.mir +++ b/llvm/test/CodeGen/PowerPC/botheightreduce.mir @@ -26,7 +26,6 @@ ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7 ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8) ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8) ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8) @@ -34,9 +33,10 @@ ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8) ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8) ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8) - ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) - ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8) + ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] + ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) + ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]] ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]] ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]] diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -116,15 +116,14 @@ ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: -;
CHECK-NEXT: ldx r9, r6, r7 -; CHECK-NEXT: ld r10, 0(r6) -; CHECK-NEXT: ldx r11, r6, r5 -; CHECK-NEXT: addi r8, r6, 1 -; CHECK-NEXT: ld r6, 4(r6) -; CHECK-NEXT: mulld r9, r10, r9 -; CHECK-NEXT: mulld r9, r9, r11 -; CHECK-NEXT: maddld r3, r9, r6, r3 -; CHECK-NEXT: mr r6, r8 +; CHECK-NEXT: ldx r8, r6, r7 +; CHECK-NEXT: ld r9, 0(r6) +; CHECK-NEXT: ldx r10, r6, r5 +; CHECK-NEXT: ld r11, 4(r6) +; CHECK-NEXT: addi r6, r6, 1 +; CHECK-NEXT: mulld r8, r9, r8 +; CHECK-NEXT: mulld r8, r8, r10 +; CHECK-NEXT: maddld r3, r8, r11, r3 ; CHECK-NEXT: bdnz .LBB1_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: add r3, r3, r4 @@ -217,25 +216,24 @@ ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: ldx r12, r9, r6 -; CHECK-NEXT: ld r0, 0(r9) -; CHECK-NEXT: ldx r30, r9, r5 -; CHECK-NEXT: ldx r29, r9, r7 -; CHECK-NEXT: addi r11, r9, 1 -; CHECK-NEXT: mulld r12, r0, r12 -; CHECK-NEXT: ld r28, 4(r9) -; CHECK-NEXT: ldx r27, r9, r8 -; CHECK-NEXT: ld r26, 12(r9) -; CHECK-NEXT: ld r25, 8(r9) -; CHECK-NEXT: ldx r9, r9, r10 -; CHECK-NEXT: mulld r12, r12, r30 -; CHECK-NEXT: mulld r12, r12, r29 -; CHECK-NEXT: mulld r12, r12, r28 -; CHECK-NEXT: mulld r12, r12, r27 -; CHECK-NEXT: mulld r12, r12, r26 -; CHECK-NEXT: mulld r12, r12, r25 -; CHECK-NEXT: maddld r3, r12, r9, r3 -; CHECK-NEXT: mr r9, r11 +; CHECK-NEXT: ldx r11, r9, r6 +; CHECK-NEXT: ld r12, 0(r9) +; CHECK-NEXT: ldx r0, r9, r5 +; CHECK-NEXT: ldx r30, r9, r7 +; CHECK-NEXT: mulld r11, r12, r11 +; CHECK-NEXT: ld r29, 4(r9) +; CHECK-NEXT: ldx r28, r9, r8 +; CHECK-NEXT: ld r27, 12(r9) +; CHECK-NEXT: ld r26, 8(r9) +; CHECK-NEXT: ldx r25, r9, r10 +; CHECK-NEXT: addi r9, r9, 1 +; CHECK-NEXT: mulld r11, r11, r0 +; CHECK-NEXT: mulld r11, r11, r30 +; CHECK-NEXT: mulld r11, r11, r29 +; CHECK-NEXT: mulld r11, r11, r28 +; CHECK-NEXT: mulld r11, r11, r27 +; CHECK-NEXT: mulld r11, r11, r26 +; CHECK-NEXT: maddld r3, r11, r25, r3 ; CHECK-NEXT: bdnz .LBB2_2 ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_3: @@ -624,10 +622,10 @@ ; 
CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: beq cr0, .LBB6_8 ; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: cmpldi r4, 1 ; CHECK-NEXT: li r7, 1 ; CHECK-NEXT: addi r6, r3, 4009 -; CHECK-NEXT: addis r5, r2, .LC0@toc@ha ; CHECK-NEXT: ld r5, .LC0@toc@l(r5) ; CHECK-NEXT: iselgt r8, r4, r7 ; CHECK-NEXT: lis r4, -21846 @@ -639,11 +637,11 @@ ; CHECK-NEXT: li r30, 1 ; CHECK-NEXT: ld r5, 0(r5) ; CHECK-NEXT: mtctr r8 -; CHECK-NEXT: li r8, -9 -; CHECK-NEXT: addi r5, r5, -1 ; CHECK-NEXT: ori r4, r4, 43691 +; CHECK-NEXT: li r8, -9 ; CHECK-NEXT: li r29, 1 ; CHECK-NEXT: li r28, 1 +; CHECK-NEXT: addi r5, r5, -1 ; CHECK-NEXT: b .LBB6_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: @@ -652,8 +650,8 @@ ; CHECK-NEXT: ld r0, -8(r6) ; CHECK-NEXT: add r29, r0, r29 ; CHECK-NEXT: .LBB6_3: -; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r29, r28 +; CHECK-NEXT: addi r6, r6, 1 ; CHECK-NEXT: mulld r0, r0, r30 ; CHECK-NEXT: mulld r0, r0, r12 ; CHECK-NEXT: mulld r0, r0, r11 @@ -802,8 +800,8 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB7_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: xxlxor f1, f1, f1 ; CHECK-NEXT: mtctr r4 ; CHECK-NEXT: li r4, -1 @@ -884,8 +882,8 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB8_4 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r3, r3, 4002 ; CHECK-NEXT: xxlxor f1, f1, f1 ; CHECK-NEXT: mtctr r4 ; CHECK-NEXT: li r4, -1 diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -400,9 +400,9 @@ ; CHECK-NEXT: .LBB9_2: # %for.body ; CHECK-NEXT: # ; CHECK-NEXT: rldic r7, r6, 4, 28 -; CHECK-NEXT: addi r6, r6, 6 ; CHECK-NEXT: xxsetaccz acc2 ; CHECK-NEXT: xxsetaccz acc1 +; CHECK-NEXT: addi r6, r6, 6 ; 
CHECK-NEXT: lxvx vs0, r5, r7 ; CHECK-NEXT: add r7, r5, r7 ; CHECK-NEXT: lxv vs1, 16(r7) @@ -414,8 +414,8 @@ ; CHECK-NEXT: lxv vs12, 64(r7) ; CHECK-NEXT: lxv vs13, 80(r7) ; CHECK-NEXT: rldic r7, r4, 6, 26 -; CHECK-NEXT: addi r4, r4, 3 ; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: addi r4, r4, 3 ; CHECK-NEXT: xxmfacc acc1 ; CHECK-NEXT: xvf32gernp acc0, vs12, vs13 ; CHECK-NEXT: stxvx vs11, r3, r7 @@ -449,9 +449,9 @@ ; CHECK-BE-NEXT: .LBB9_2: # %for.body ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: rldic r7, r6, 4, 28 -; CHECK-BE-NEXT: addi r6, r6, 6 ; CHECK-BE-NEXT: xxsetaccz acc2 ; CHECK-BE-NEXT: xxsetaccz acc1 +; CHECK-BE-NEXT: addi r6, r6, 6 ; CHECK-BE-NEXT: lxvx vs0, r5, r7 ; CHECK-BE-NEXT: add r7, r5, r7 ; CHECK-BE-NEXT: lxv vs1, 16(r7) @@ -463,8 +463,8 @@ ; CHECK-BE-NEXT: lxv vs12, 64(r7) ; CHECK-BE-NEXT: lxv vs13, 80(r7) ; CHECK-BE-NEXT: rldic r7, r4, 6, 26 -; CHECK-BE-NEXT: addi r4, r4, 3 ; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: addi r4, r4, 3 ; CHECK-BE-NEXT: xxmfacc acc1 ; CHECK-BE-NEXT: xvf32gernp acc0, vs12, vs13 ; CHECK-BE-NEXT: stxvx vs8, r3, r7 @@ -544,8 +544,7 @@ declare i32 @testRedundantPrimeUnprimeF() define void @testRedundantPrimeUnprime(<512 x i1>* %dst, <16 x i8> %vc) nounwind { ; CHECK-LABEL: testRedundantPrimeUnprime: -; CHECK: .localentry testRedundantPrimeUnprime, 1 -; CHECK-NEXT: # %bb.0: # %entry +; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -17,17 +17,17 @@ ; CHECK-NEXT: xxsetaccz acc0 ; CHECK-NEXT: blt cr0, .LBB0_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: clrldi r6, r5, 32 -; CHECK-NEXT: addi r5, r4, 32 -; CHECK-NEXT: addi r6, r6, -2 +; CHECK-NEXT: clrldi r5, r5, 32 ; CHECK-NEXT: lxv vs4, 0(r4) ; CHECK-NEXT: lxv vs5, 16(r4) -; CHECK-NEXT: 
mtctr r6 +; CHECK-NEXT: addi r4, r4, 32 +; CHECK-NEXT: addi r5, r5, -2 +; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lxv vs6, 0(r5) -; CHECK-NEXT: addi r5, r5, 16 +; CHECK-NEXT: lxv vs6, 0(r4) +; CHECK-NEXT: addi r4, r4, 16 ; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup @@ -44,17 +44,17 @@ ; CHECK-BE-NEXT: xxsetaccz acc0 ; CHECK-BE-NEXT: blt cr0, .LBB0_3 ; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader -; CHECK-BE-NEXT: clrldi r6, r5, 32 -; CHECK-BE-NEXT: addi r5, r4, 32 -; CHECK-BE-NEXT: addi r6, r6, -2 +; CHECK-BE-NEXT: clrldi r5, r5, 32 ; CHECK-BE-NEXT: lxv vs4, 0(r4) ; CHECK-BE-NEXT: lxv vs5, 16(r4) -; CHECK-BE-NEXT: mtctr r6 +; CHECK-BE-NEXT: addi r4, r4, 32 +; CHECK-BE-NEXT: addi r5, r5, -2 +; CHECK-BE-NEXT: mtctr r5 ; CHECK-BE-NEXT: .p2align 4 ; CHECK-BE-NEXT: .LBB0_2: # %for.body ; CHECK-BE-NEXT: # -; CHECK-BE-NEXT: lxv vs6, 0(r5) -; CHECK-BE-NEXT: addi r5, r5, 16 +; CHECK-BE-NEXT: lxv vs6, 0(r4) +; CHECK-BE-NEXT: addi r4, r4, 16 ; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 ; CHECK-BE-NEXT: bdnz .LBB0_2 ; CHECK-BE-NEXT: .LBB0_3: # %for.cond.cleanup @@ -115,8 +115,8 @@ ; CHECK-NEXT: xvf64ger acc0, vsp4, vs6 ; CHECK-NEXT: blt cr0, .LBB1_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: addi r4, r4, 48 ; CHECK-NEXT: clrldi r5, r5, 32 +; CHECK-NEXT: addi r4, r4, 48 ; CHECK-NEXT: addi r5, r5, -3 ; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: .p2align 4 @@ -143,8 +143,8 @@ ; CHECK-BE-NEXT: xvf64ger acc0, vsp4, vs6 ; CHECK-BE-NEXT: blt cr0, .LBB1_3 ; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader -; CHECK-BE-NEXT: addi r4, r4, 48 ; CHECK-BE-NEXT: clrldi r5, r5, 32 +; CHECK-BE-NEXT: addi r4, r4, 48 ; CHECK-BE-NEXT: addi r5, r5, -3 ; CHECK-BE-NEXT: mtctr r5 ; CHECK-BE-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- 
a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -9,10 +9,9 @@ define void @foo(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_type_of_x]* %.x, i32* %.l, <2 x double>* %.vy01, <2 x double>* %.vy02, <2 x double>* %.vy03, <2 x double>* %.vy04, <2 x double>* %.vy05, <2 x double>* %.vy06, <2 x double>* %.vy07, <2 x double>* %.vy08, <2 x double>* %.vy09, <2 x double>* %.vy0a, <2 x double>* %.vy0b, <2 x double>* %.vy0c, <2 x double>* %.vy21, <2 x double>* %.vy22, <2 x double>* %.vy23, <2 x double>* %.vy24, <2 x double>* %.vy25, <2 x double>* %.vy26, <2 x double>* %.vy27, <2 x double>* %.vy28, <2 x double>* %.vy29, <2 x double>* %.vy2a, <2 x double>* %.vy2b, <2 x double>* %.vy2c) { ; CHECK-LABEL: foo: -; CHECK: .localentry foo, 1 -; CHECK-NEXT: # %bb.0: # %entry -; CHECK-NEXT: stdu 1, -448(1) -; CHECK-NEXT: .cfi_def_cfa_offset 448 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stdu 1, -480(1) +; CHECK-NEXT: .cfi_def_cfa_offset 480 ; CHECK-NEXT: .cfi_offset r14, -256 ; CHECK-NEXT: .cfi_offset r15, -248 ; CHECK-NEXT: .cfi_offset r16, -240 @@ -46,308 +45,318 @@ ; CHECK-NEXT: .cfi_offset f30, -16 ; CHECK-NEXT: .cfi_offset f31, -8 ; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: std 14, 192(1) # 8-byte Folded Spill -; CHECK-NEXT: std 15, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: std 14, 224(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 232(1) # 8-byte Folded Spill ; CHECK-NEXT: cmpwi 4, 1 -; CHECK-NEXT: std 16, 208(1) # 8-byte Folded Spill -; CHECK-NEXT: std 17, 216(1) # 8-byte Folded Spill -; CHECK-NEXT: std 18, 224(1) # 8-byte Folded Spill -; CHECK-NEXT: std 19, 232(1) # 8-byte Folded Spill -; CHECK-NEXT: std 20, 240(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 248(1) # 8-byte Folded Spill -; CHECK-NEXT: std 22, 256(1) # 8-byte Folded Spill -; CHECK-NEXT: std 23, 264(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 272(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 280(1) # 8-byte Folded Spill -; 
CHECK-NEXT: std 26, 288(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 296(1) # 8-byte Folded Spill -; CHECK-NEXT: std 28, 304(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 312(1) # 8-byte Folded Spill -; CHECK-NEXT: std 30, 320(1) # 8-byte Folded Spill -; CHECK-NEXT: std 31, 328(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 18, 336(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 19, 344(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 20, 352(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 21, 360(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 22, 368(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 23, 376(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 24, 384(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 25, 392(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 26, 400(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 27, 408(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 28, 416(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 29, 424(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 30, 432(1) # 8-byte Folded Spill -; CHECK-NEXT: stfd 31, 440(1) # 8-byte Folded Spill +; CHECK-NEXT: std 16, 240(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 248(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 256(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 264(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 272(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 280(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 288(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 296(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 304(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 312(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 320(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 328(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 336(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 344(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 352(1) # 8-byte Folded Spill +; CHECK-NEXT: std 31, 360(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 18, 368(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 19, 376(1) # 8-byte Folded Spill +; 
CHECK-NEXT: stfd 20, 384(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 21, 392(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 22, 400(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 23, 408(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 24, 416(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 25, 424(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 26, 432(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 27, 440(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 28, 448(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 29, 456(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 30, 464(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 31, 472(1) # 8-byte Folded Spill ; CHECK-NEXT: blt 0, .LBB0_7 ; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: cmpwi 3, 1 +; CHECK-NEXT: mr 23, 5 +; CHECK-NEXT: lwz 5, 0(3) +; CHECK-NEXT: cmpwi 5, 1 ; CHECK-NEXT: blt 0, .LBB0_7 ; CHECK-NEXT: # %bb.2: # %_loop_1_do_.preheader -; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: mr 24, 5 -; CHECK-NEXT: li 5, 9 -; CHECK-NEXT: mr 11, 7 -; CHECK-NEXT: ld 12, 640(1) -; CHECK-NEXT: std 9, 176(1) # 8-byte Folded Spill -; CHECK-NEXT: std 10, 184(1) # 8-byte Folded Spill -; CHECK-NEXT: mr 7, 6 -; CHECK-NEXT: ld 6, 544(1) -; CHECK-NEXT: lxv 1, 0(9) -; CHECK-NEXT: ld 9, 648(1) -; CHECK-NEXT: ld 29, 688(1) -; CHECK-NEXT: ld 28, 680(1) -; CHECK-NEXT: ld 2, 632(1) -; CHECK-NEXT: ld 26, 624(1) -; CHECK-NEXT: lxv 0, 0(10) -; CHECK-NEXT: cmpldi 3, 9 -; CHECK-NEXT: lxv 4, 0(8) -; CHECK-NEXT: ld 30, 664(1) -; CHECK-NEXT: ld 10, 704(1) -; CHECK-NEXT: ld 27, 672(1) -; CHECK-NEXT: ld 25, 616(1) -; CHECK-NEXT: ld 23, 608(1) -; CHECK-NEXT: ld 22, 600(1) -; CHECK-NEXT: ld 21, 592(1) -; CHECK-NEXT: ld 19, 584(1) -; CHECK-NEXT: ld 17, 576(1) -; CHECK-NEXT: iselgt 3, 3, 5 -; CHECK-NEXT: ld 5, 656(1) -; CHECK-NEXT: addi 3, 3, -2 -; CHECK-NEXT: lwa 20, 0(11) -; CHECK-NEXT: lxv 13, 0(12) -; CHECK-NEXT: std 6, 128(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 136(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 2, 0(6) -; CHECK-NEXT: 
ld 6, 696(1) -; CHECK-NEXT: lxv 34, 0(2) -; CHECK-NEXT: lxv 7, 0(29) -; CHECK-NEXT: lxv 39, 0(17) -; CHECK-NEXT: lxv 38, 0(19) -; CHECK-NEXT: lxv 33, 0(21) -; CHECK-NEXT: lxv 32, 0(22) -; CHECK-NEXT: lxv 37, 0(23) -; CHECK-NEXT: lxv 36, 0(25) -; CHECK-NEXT: lxv 35, 0(26) -; CHECK-NEXT: lxv 11, 0(9) -; CHECK-NEXT: lxv 12, 0(30) -; CHECK-NEXT: rldicl 3, 3, 61, 3 -; CHECK-NEXT: addi 0, 3, 1 -; CHECK-NEXT: ld 3, 560(1) -; CHECK-NEXT: sldi 11, 20, 2 -; CHECK-NEXT: lxv 9, 0(5) -; CHECK-NEXT: lxv 10, 0(27) -; CHECK-NEXT: lxv 8, 0(28) -; CHECK-NEXT: lxv 6, 0(6) -; CHECK-NEXT: lxv 5, 0(10) -; CHECK-NEXT: lxv 3, 0(3) -; CHECK-NEXT: std 3, 96(1) # 8-byte Folded Spill -; CHECK-NEXT: std 12, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: sldi 3, 20, 4 -; CHECK-NEXT: add 12, 20, 11 -; CHECK-NEXT: std 8, 168(1) # 8-byte Folded Spill -; CHECK-NEXT: std 6, 160(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 8, 552(1) -; CHECK-NEXT: sldi 18, 20, 1 +; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: li 20, 9 +; CHECK-NEXT: ld 28, 728(1) +; CHECK-NEXT: ld 19, 616(1) +; CHECK-NEXT: lwa 3, 0(7) +; CHECK-NEXT: ld 7, 688(1) +; CHECK-NEXT: ld 12, 680(1) +; CHECK-NEXT: ld 11, 672(1) +; CHECK-NEXT: ld 2, 664(1) +; CHECK-NEXT: ld 29, 736(1) +; CHECK-NEXT: cmpldi 5, 9 +; CHECK-NEXT: ld 27, 720(1) +; CHECK-NEXT: ld 26, 712(1) +; CHECK-NEXT: ld 25, 704(1) +; CHECK-NEXT: ld 24, 696(1) +; CHECK-NEXT: iselgt 5, 5, 20 +; CHECK-NEXT: ld 30, 656(1) +; CHECK-NEXT: ld 22, 648(1) +; CHECK-NEXT: ld 21, 640(1) +; CHECK-NEXT: ld 20, 632(1) +; CHECK-NEXT: ld 18, 608(1) +; CHECK-NEXT: ld 17, 600(1) +; CHECK-NEXT: ld 16, 592(1) +; CHECK-NEXT: ld 14, 584(1) +; CHECK-NEXT: sldi 0, 3, 2 +; CHECK-NEXT: std 5, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 208(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 5, 4 +; CHECK-NEXT: ld 4, 624(1) +; CHECK-NEXT: std 19, 96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 11, 0(4) +; CHECK-NEXT: mr 4, 5 +; CHECK-NEXT: ld 5, 216(1) # 8-byte 
Folded Reload +; CHECK-NEXT: ld 15, 576(1) +; CHECK-NEXT: sldi 31, 3, 1 +; CHECK-NEXT: std 8, 32(1) # 8-byte Folded Spill +; CHECK-NEXT: std 9, 40(1) # 8-byte Folded Spill ; CHECK-NEXT: lxv 41, 0(8) -; CHECK-NEXT: add 3, 3, 24 -; CHECK-NEXT: addi 16, 3, 32 -; CHECK-NEXT: sldi 3, 20, 3 -; CHECK-NEXT: std 9, 112(1) # 8-byte Folded Spill -; CHECK-NEXT: std 5, 120(1) # 8-byte Folded Spill -; CHECK-NEXT: sldi 5, 12, 3 -; CHECK-NEXT: std 26, 80(1) # 8-byte Folded Spill -; CHECK-NEXT: std 2, 88(1) # 8-byte Folded Spill -; CHECK-NEXT: add 2, 24, 5 -; CHECK-NEXT: mr 9, 30 -; CHECK-NEXT: li 26, 1 -; CHECK-NEXT: add 3, 3, 24 -; CHECK-NEXT: addi 31, 3, 32 -; CHECK-NEXT: ld 3, 568(1) -; CHECK-NEXT: std 28, 144(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 152(1) # 8-byte Folded Spill -; CHECK-NEXT: sldi 5, 20, 5 -; CHECK-NEXT: add 29, 20, 18 -; CHECK-NEXT: std 23, 64(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 72(1) # 8-byte Folded Spill -; CHECK-NEXT: mulli 27, 20, 48 -; CHECK-NEXT: add 30, 24, 5 -; CHECK-NEXT: li 25, 0 -; CHECK-NEXT: lxv 40, 0(3) -; CHECK-NEXT: mulli 23, 20, 6 -; CHECK-NEXT: sldi 5, 29, 3 -; CHECK-NEXT: add 28, 24, 5 -; CHECK-NEXT: mr 5, 24 -; CHECK-NEXT: std 17, 32(1) # 8-byte Folded Spill -; CHECK-NEXT: std 19, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: std 22, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 8, 6 +; CHECK-NEXT: sldi 6, 3, 3 +; CHECK-NEXT: std 2, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 11, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 3, 0(2) +; CHECK-NEXT: lxv 2, 0(11) +; CHECK-NEXT: lxv 0, 0(7) +; CHECK-NEXT: add 6, 6, 23 +; CHECK-NEXT: lxv 7, 0(28) +; CHECK-NEXT: add 28, 3, 31 +; CHECK-NEXT: lxv 40, 0(9) +; CHECK-NEXT: lxv 39, 0(10) +; CHECK-NEXT: lxv 38, 0(15) +; CHECK-NEXT: lxv 33, 0(14) +; CHECK-NEXT: lxv 32, 0(16) +; CHECK-NEXT: lxv 37, 0(17) +; CHECK-NEXT: lxv 35, 0(18) +; CHECK-NEXT: lxv 13, 0(19) +; CHECK-NEXT: lxv 10, 0(20) +; CHECK-NEXT: lxv 8, 0(21) +; CHECK-NEXT: 
lxv 6, 0(22) +; CHECK-NEXT: lxv 4, 0(30) +; CHECK-NEXT: lxv 1, 0(12) +; CHECK-NEXT: lxv 36, 0(24) +; CHECK-NEXT: lxv 34, 0(25) +; CHECK-NEXT: lxv 12, 0(26) +; CHECK-NEXT: lxv 9, 0(27) +; CHECK-NEXT: lxv 5, 0(29) +; CHECK-NEXT: addi 5, 5, -2 +; CHECK-NEXT: sldi 11, 3, 4 +; CHECK-NEXT: std 12, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: std 7, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 3, 0 +; CHECK-NEXT: add 12, 11, 23 +; CHECK-NEXT: addi 11, 6, 32 +; CHECK-NEXT: addi 12, 12, 32 +; CHECK-NEXT: std 22, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 26, 3, 48 +; CHECK-NEXT: mulli 22, 3, 6 +; CHECK-NEXT: sldi 6, 7, 3 +; CHECK-NEXT: add 30, 23, 6 +; CHECK-NEXT: std 29, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: li 25, 1 +; CHECK-NEXT: li 24, 0 +; CHECK-NEXT: std 10, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: std 14, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: std 16, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: rldicl 5, 5, 61, 3 +; CHECK-NEXT: addi 2, 5, 1 +; CHECK-NEXT: sldi 5, 3, 5 +; CHECK-NEXT: add 29, 23, 5 +; CHECK-NEXT: sldi 5, 28, 3 +; CHECK-NEXT: add 27, 23, 5 +; CHECK-NEXT: mr 5, 23 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_4 Depth 2 -; CHECK-NEXT: maddld 6, 23, 25, 12 -; CHECK-NEXT: maddld 21, 23, 25, 11 -; CHECK-NEXT: mtctr 0 +; CHECK-NEXT: maddld 6, 22, 24, 7 +; CHECK-NEXT: maddld 20, 22, 24, 0 +; CHECK-NEXT: mtctr 2 ; CHECK-NEXT: sldi 6, 
6, 3 -; CHECK-NEXT: add 22, 24, 6 -; CHECK-NEXT: sldi 6, 21, 3 -; CHECK-NEXT: add 21, 24, 6 -; CHECK-NEXT: maddld 6, 23, 25, 29 +; CHECK-NEXT: add 21, 23, 6 +; CHECK-NEXT: sldi 6, 20, 3 +; CHECK-NEXT: add 20, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 28 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 19, 24, 6 -; CHECK-NEXT: maddld 6, 23, 25, 18 +; CHECK-NEXT: add 19, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 31 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 17, 24, 6 -; CHECK-NEXT: maddld 6, 23, 25, 20 +; CHECK-NEXT: add 18, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 3 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 15, 24, 6 -; CHECK-NEXT: mulld 6, 23, 25 +; CHECK-NEXT: add 17, 23, 6 +; CHECK-NEXT: mulld 6, 22, 24 ; CHECK-NEXT: sldi 6, 6, 3 -; CHECK-NEXT: add 14, 24, 6 -; CHECK-NEXT: mr 6, 7 +; CHECK-NEXT: add 16, 23, 6 +; CHECK-NEXT: mr 6, 8 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_4: # %_loop_2_do_ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: lxvp 42, 0(6) -; CHECK-NEXT: lxvp 44, 0(14) -; CHECK-NEXT: lxvp 46, 0(15) -; CHECK-NEXT: lxvp 48, 0(17) +; CHECK-NEXT: lxvp 44, 0(16) +; CHECK-NEXT: lxvp 46, 0(17) +; CHECK-NEXT: lxvp 48, 0(18) ; CHECK-NEXT: lxvp 50, 0(19) -; CHECK-NEXT: lxvp 30, 0(21) -; CHECK-NEXT: lxvp 28, 0(22) +; CHECK-NEXT: lxvp 30, 0(20) +; CHECK-NEXT: lxvp 28, 0(21) ; CHECK-NEXT: lxvp 26, 32(6) -; CHECK-NEXT: lxvp 24, 32(14) -; CHECK-NEXT: lxvp 22, 32(15) -; CHECK-NEXT: lxvp 20, 32(17) +; CHECK-NEXT: lxvp 24, 32(16) +; CHECK-NEXT: lxvp 22, 32(17) +; CHECK-NEXT: lxvp 20, 32(18) ; CHECK-NEXT: lxvp 18, 32(19) ; CHECK-NEXT: addi 6, 6, 64 -; CHECK-NEXT: addi 14, 14, 64 -; CHECK-NEXT: addi 15, 15, 64 +; CHECK-NEXT: addi 16, 16, 64 ; CHECK-NEXT: addi 17, 17, 64 +; CHECK-NEXT: addi 18, 18, 64 ; CHECK-NEXT: addi 19, 19, 64 -; CHECK-NEXT: xvmaddadp 4, 45, 43 -; CHECK-NEXT: xvmaddadp 1, 47, 43 -; CHECK-NEXT: xvmaddadp 0, 49, 43 -; CHECK-NEXT: xvmaddadp 2, 51, 43 -; CHECK-NEXT: xvmaddadp 41, 31, 43 -; 
CHECK-NEXT: xvmaddadp 3, 29, 43 -; CHECK-NEXT: xvmaddadp 40, 44, 42 -; CHECK-NEXT: xvmaddadp 39, 46, 42 -; CHECK-NEXT: xvmaddadp 38, 48, 42 -; CHECK-NEXT: xvmaddadp 33, 50, 42 -; CHECK-NEXT: xvmaddadp 32, 30, 42 -; CHECK-NEXT: xvmaddadp 37, 28, 42 -; CHECK-NEXT: lxvp 42, 32(21) -; CHECK-NEXT: lxvp 44, 32(22) +; CHECK-NEXT: xvmaddadp 41, 45, 43 +; CHECK-NEXT: xvmaddadp 40, 47, 43 +; CHECK-NEXT: xvmaddadp 39, 49, 43 +; CHECK-NEXT: xvmaddadp 38, 51, 43 +; CHECK-NEXT: xvmaddadp 33, 31, 43 +; CHECK-NEXT: xvmaddadp 32, 29, 43 +; CHECK-NEXT: xvmaddadp 37, 44, 42 +; CHECK-NEXT: xvmaddadp 35, 46, 42 +; CHECK-NEXT: xvmaddadp 13, 48, 42 +; CHECK-NEXT: xvmaddadp 11, 50, 42 +; CHECK-NEXT: xvmaddadp 10, 30, 42 +; CHECK-NEXT: xvmaddadp 8, 28, 42 +; CHECK-NEXT: lxvp 42, 32(20) +; CHECK-NEXT: lxvp 44, 32(21) +; CHECK-NEXT: addi 20, 20, 64 ; CHECK-NEXT: addi 21, 21, 64 -; CHECK-NEXT: addi 22, 22, 64 -; CHECK-NEXT: xvmaddadp 36, 25, 27 -; CHECK-NEXT: xvmaddadp 35, 23, 27 -; CHECK-NEXT: xvmaddadp 34, 21, 27 -; CHECK-NEXT: xvmaddadp 13, 19, 27 -; CHECK-NEXT: xvmaddadp 12, 24, 26 -; CHECK-NEXT: xvmaddadp 10, 22, 26 -; CHECK-NEXT: xvmaddadp 8, 20, 26 -; CHECK-NEXT: xvmaddadp 7, 18, 26 -; CHECK-NEXT: xvmaddadp 11, 43, 27 -; CHECK-NEXT: xvmaddadp 9, 45, 27 -; CHECK-NEXT: xvmaddadp 6, 42, 26 +; CHECK-NEXT: xvmaddadp 6, 25, 27 +; CHECK-NEXT: xvmaddadp 4, 23, 27 +; CHECK-NEXT: xvmaddadp 3, 21, 27 +; CHECK-NEXT: xvmaddadp 2, 19, 27 +; CHECK-NEXT: xvmaddadp 36, 24, 26 +; CHECK-NEXT: xvmaddadp 34, 22, 26 +; CHECK-NEXT: xvmaddadp 12, 20, 26 +; CHECK-NEXT: xvmaddadp 9, 18, 26 +; CHECK-NEXT: xvmaddadp 1, 43, 27 +; CHECK-NEXT: xvmaddadp 0, 45, 27 +; CHECK-NEXT: xvmaddadp 7, 42, 26 ; CHECK-NEXT: xvmaddadp 5, 44, 26 ; CHECK-NEXT: bdnz .LBB0_4 ; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ ; CHECK-NEXT: # -; CHECK-NEXT: addi 26, 26, 6 -; CHECK-NEXT: add 5, 5, 27 -; CHECK-NEXT: add 31, 31, 27 -; CHECK-NEXT: add 2, 2, 27 -; CHECK-NEXT: add 16, 16, 27 -; CHECK-NEXT: add 30, 30, 27 -; CHECK-NEXT: add 28, 28, 27 
-; CHECK-NEXT: addi 25, 25, 1 -; CHECK-NEXT: cmpld 26, 4 +; CHECK-NEXT: addi 25, 25, 6 +; CHECK-NEXT: addi 24, 24, 1 +; CHECK-NEXT: add 5, 5, 26 +; CHECK-NEXT: add 11, 11, 26 +; CHECK-NEXT: add 30, 30, 26 +; CHECK-NEXT: add 12, 12, 26 +; CHECK-NEXT: add 29, 29, 26 +; CHECK-NEXT: add 27, 27, 26 +; CHECK-NEXT: cmpld 25, 4 ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit -; CHECK-NEXT: ld 4, 168(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 4, 0(4) -; CHECK-NEXT: ld 4, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 1, 0(4) -; CHECK-NEXT: ld 4, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 0, 0(4) -; CHECK-NEXT: ld 4, 128(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 2, 0(4) -; CHECK-NEXT: ld 4, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 41, 0(8) -; CHECK-NEXT: stxv 3, 0(4) -; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: stxv 39, 0(3) ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: stxv 38, 0(3) ; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: stxv 32, 0(3) ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: stxv 37, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 34, 0(3) -; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 13, 0(3) -; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 11, 0(3) +; 
CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 10, 0(3) ; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 6, 0(3) ; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 12, 0(9) -; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: stxv 4, 0(3) ; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: stxv 3, 0(3) ; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: stxv 2, 0(3) ; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 6, 0(3) -; CHECK-NEXT: stxv 5, 0(10) +; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 0, 0(3) +; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 34, 0(3) +; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 12, 0(3) +; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: ld 3, 208(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: ld 3, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 5, 0(3) ; CHECK-NEXT: .LBB0_7: # %_return_bb -; CHECK-NEXT: lfd 31, 440(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 30, 432(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 31, 328(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 30, 320(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 29, 312(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 28, 304(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 27, 296(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 26, 288(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 25, 280(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 29, 424(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 24, 272(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 23, 264(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 22, 256(1) # 
8-byte Folded Reload -; CHECK-NEXT: lfd 28, 416(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 21, 248(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 20, 240(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 19, 232(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 27, 408(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 18, 224(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 17, 216(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 16, 208(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 26, 400(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 15, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 14, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 25, 392(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 24, 384(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 23, 376(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 22, 368(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 21, 360(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 20, 352(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 19, 344(1) # 8-byte Folded Reload -; CHECK-NEXT: lfd 18, 336(1) # 8-byte Folded Reload -; CHECK-NEXT: addi 1, 1, 448 +; CHECK-NEXT: lfd 31, 472(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 30, 464(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 31, 360(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 30, 352(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 29, 344(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 28, 336(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 27, 328(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 26, 320(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 25, 312(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 29, 456(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 24, 304(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 23, 296(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, 288(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 28, 448(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, 280(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, 272(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, 264(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 27, 440(1) # 8-byte Folded Reload +; 
CHECK-NEXT: ld 18, 256(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 17, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 16, 240(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 26, 432(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 232(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 14, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 25, 424(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 24, 416(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 23, 408(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 22, 400(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 21, 392(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 20, 384(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 19, 376(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 18, 368(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 480 ; CHECK-NEXT: blr entry: %_val_l_ = load i32, i32* %.l, align 4 diff --git a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll --- a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll +++ b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll @@ -375,9 +375,9 @@ ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld 12, 384(1) # 8-byte Folded Reload ; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload -; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 376(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 368(1) # 8-byte Folded Reload ; CHECK-NEXT: std 3, 0(12) diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll --- a/llvm/test/CodeGen/PowerPC/sched-addi.ll +++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll @@ -15,8 +15,8 @@ ; CHECK-P9-NEXT: ld 5, 0(5) ; CHECK-P9-NEXT: addis 6, 2, scalars@toc@ha ; CHECK-P9-NEXT: addi 6, 6, scalars@toc@l -; CHECK-P9-NEXT: addi 6, 6, 16 ; CHECK-P9-NEXT: rldicr 5, 5, 0, 58 +; CHECK-P9-NEXT: addi 6, 6, 16 ; CHECK-P9-NEXT: addi 5, 5, -32 ; CHECK-P9-NEXT: lxvdsx 
0, 0, 6 ; CHECK-P9-NEXT: rldicl 5, 5, 59, 5 @@ -35,9 +35,9 @@ ; CHECK-P9-NEXT: xvmuldp 1, 1, 0 ; CHECK-P9-NEXT: xvmuldp 4, 4, 0 ; CHECK-P9-NEXT: xvmuldp 3, 3, 0 +; CHECK-P9-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NEXT: xvmuldp 5, 5, 0 ; CHECK-P9-NEXT: addi 4, 4, 256 -; CHECK-P9-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NEXT: stxv 1, 16(3) ; CHECK-P9-NEXT: stxv 2, 0(3) ; CHECK-P9-NEXT: stxv 3, 48(3) diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -22,22 +22,22 @@ ; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: lbz 5, 0(5) +; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: bdz .LBB0_6 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 7, 32 +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: addi 8, 7, -1 ; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: cntlzw 6, 6 +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: srwi 7, 6, 5 ; CHECK-NEXT: xori 6, 5, 84 ; CHECK-NEXT: clrldi 5, 8, 32 @@ -46,12 +46,12 @@ ; CHECK-NEXT: bdz .LBB0_4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: clrldi 10, 8, 32 -; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: cntlzw 9, 6 ; CHECK-NEXT: xori 6, 5, 84 +; CHECK-NEXT: addi 8, 8, -1 ; CHECK-NEXT: lbz 5, 0(10) +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: add 4, 4, 7 ; CHECK-NEXT: srwi 7, 9, 5 ; CHECK-NEXT: bdnz .LBB0_3 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-phi-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-1.ll @@ -14,17 +14,17 @@ ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: bl calloc ; CHECK-NEXT: nop +; CHECK-NEXT: clrldi 4, 30, 32 ; 
CHECK-NEXT: li 5, 0 ; CHECK-NEXT: addi 3, 3, -4 -; CHECK-NEXT: li 6, 1 -; CHECK-NEXT: clrldi 4, 30, 32 ; CHECK-NEXT: mtctr 4 ; CHECK-NEXT: mullw 4, 5, 5 +; CHECK-NEXT: li 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: stwu 4, 4(3) ; CHECK-NEXT: mullw 4, 6, 6 +; CHECK-NEXT: addi 5, 6, 1 ; CHECK-NEXT: bdz .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/PowerPC/sms-simple.ll b/llvm/test/CodeGen/PowerPC/sms-simple.ll --- a/llvm/test/CodeGen/PowerPC/sms-simple.ll +++ b/llvm/test/CodeGen/PowerPC/sms-simple.ll @@ -9,15 +9,15 @@ define dso_local i32* @foo() local_unnamed_addr { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis r5, r2, x@toc@ha -; CHECK-NEXT: addis r6, r2, y@toc@ha +; CHECK-NEXT: addis r5, r2, y@toc@ha ; CHECK-NEXT: li r7, 340 -; CHECK-NEXT: addi r5, r5, x@toc@l -; CHECK-NEXT: addi r5, r5, -8 -; CHECK-NEXT: addi r3, r6, y@toc@l -; CHECK-NEXT: lwz r6, y@toc@l(r6) +; CHECK-NEXT: addi r3, r5, y@toc@l +; CHECK-NEXT: lwz r6, y@toc@l(r5) +; CHECK-NEXT: addis r5, r2, x@toc@ha ; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: addi r5, r5, x@toc@l ; CHECK-NEXT: addi r4, r3, -8 +; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: lwzu r7, 12(r5) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 4(r5) diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll --- a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll @@ -50,9 +50,9 @@ ; CHECK-P9-LE-NEXT: std r31, -8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: li r6, -32768 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: neg r5, r3 @@ -189,9 +189,9 @@ ; CHECK-P9-LE-NEXT: std r31, 
-8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r4, r4, 15 ; CHECK-P9-LE-NEXT: li r7, -4096 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r4, r4, 15 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29 ; CHECK-P9-LE-NEXT: neg r6, r4 @@ -333,10 +333,10 @@ ; CHECK-P9-LE-NEXT: std r31, -8(r1) ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-P9-LE-NEXT: addi r3, r3, 15 ; CHECK-P9-LE-NEXT: lis r5, -1 -; CHECK-P9-LE-NEXT: ori r5, r5, 0 ; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r3, r3, 15 +; CHECK-P9-LE-NEXT: ori r5, r5, 0 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: neg r6, r3