Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -405,6 +405,7 @@ // Initialize the context of the pass. MF = &mf; + MLI = &getAnalysis(); PassConfig = &getAnalysis(); if (VerifyScheduling) Index: lib/Target/SystemZ/SystemZHazardRecognizer.h =================================================================== --- lib/Target/SystemZ/SystemZHazardRecognizer.h +++ lib/Target/SystemZ/SystemZHazardRecognizer.h @@ -19,6 +19,13 @@ // * Processor resources usage. It is beneficial to balance the use of // resources. // +// A goal is to consider all instructions, also those outside of any +// scheduling region. Such instructions are "advanced" past and include +// single instructions before a scheduling region, branches etc. +// +// A block that has only one predecessor continues scheduling with the state +// of it (which may be updated by emitting branches). +// // ===---------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H @@ -35,11 +42,29 @@ namespace llvm { -/// SystemZHazardRecognizer maintains the state during scheduling. +class SystemZHazardRecognizer; +typedef std::map MBB2HazRec; + +/// SystemZHazardRecognizer maintains the state for one MBB during +/// scheduling. class SystemZHazardRecognizer : public ScheduleHazardRecognizer { + // The global map of scheduler states. + MBB2HazRec *SchedStates; + ScheduleDAGMI *DAG; - const TargetSchedModel *SchedModel; + const SystemZInstrInfo *TII; + + // A SchedModel is needed before any DAG is built, while advancing past + // non-scheduled instructions. + TargetSchedModel SchedModel; + + // MBB and Loop that this HazardRecognizer will operate in. 
+ MachineBasicBlock *MBB; + const MachineLoop *Loop; + + // Start of region + MachineBasicBlock::iterator Begin; /// Keep track of the number of decoder slots used in the current /// decoder group. @@ -52,17 +77,17 @@ /// Counters for the number of uops scheduled per processor /// resource. - SmallVector ProcResourceCounters; + SmallVector ResourceCounters; /// This is the resource with the greatest queue, which the /// scheduler tries to avoid. unsigned CriticalResourceIdx; /// Return the number of decoder slots MI requires. - inline unsigned getNumDecoderSlots(SUnit *SU) const; + inline unsigned getNumDecoderSlots(const MachineInstr *MI) const; /// Return true if MI fits into current decoder group. - bool fitsIntoCurrentGroup(SUnit *SU) const; + bool fitsIntoCurrentGroup(MachineInstr *MI) const; /// Two decoder groups per cycle are formed (for z13), meaning 2x3 /// instructions. This function returns a number between 0 and 5, @@ -76,29 +101,55 @@ /// A counter of decoder groups scheduled. unsigned GrpCount; - unsigned getCurrGroupSize() {return CurrGroupSize;}; - /// Start next decoder group. - void nextGroup(bool DbgOutput = true); + void nextGroup(); /// Clear all counters for processor resources. - void clearProcResCounters(); + void clearResourceCounters(); + + /// Last emitted instruction or nullptr. + MachineInstr *LastEmittedMI; /// With the goal of alternating processor sides for stalling (FPd) /// ops, return true if it seems good to schedule an FPd op next. bool isFPdOpPreferred_distance(const SUnit *SU); + /// There is no SU when advancing past non-scheduled instructions. + void emitInstruction(MachineInstr *MI, SUnit *SU = nullptr); + void emitInstructionIntoCurrentDecoderGroup(MachineInstr *MI); + /// Emit a branch in a predecessor, and return true if it is a taken branch + /// to MBB. + bool emitIncomingBranch(MachineInstr *MI); + + /// Update the scheduler state by emitting (non-scheduled) instructions + /// from I to NextBegin. 
+ void advance(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator NextBegin); + + /// Take over state and continue scheduling from end of single predecessor. + void takeStateFromPred(); + + /// LastCall is used as an optimization in leaveMBB() so that in the case + /// of a call in MBB, the final state is achieved by looking at just + /// instructions after it. + MachineBasicBlock::iterator LastCall; + public: - SystemZHazardRecognizer(const MachineSchedContext *C); + SystemZHazardRecognizer(const TargetSubtargetInfo *ST, MBB2HazRec *SchedS_); + + void enterRegion(MachineBasicBlock *MBB_, + const MachineLoop *Loop_, + MachineBasicBlock::iterator Begin_); + + /// Called just before scheduling begins, with the DAG. + void initialize(ScheduleDAGMI *dag); - void setDAG(ScheduleDAGMI *dag) { - DAG = dag; - SchedModel = dag->getSchedModel(); - } - HazardType getHazardType(SUnit *m, int Stalls = 0) override; + void Reset() override; - void EmitInstruction(SUnit *SU) override; + void EmitInstruction(SUnit *SU) override { + emitInstruction(SU->getInstr(), SU); + } // Cost functions used by SystemZPostRASchedStrategy while // evaluating candidates. @@ -107,20 +158,30 @@ /// new decoder group, this is negative if this fits the schedule or /// positive if it would mean ending a group prematurely. For normal /// instructions this returns 0. - int groupingCost(SUnit *SU) const; + int groupingCost(const SUnit *SU) const; /// Return the cost of SU in regards to processor resources usage. /// A positive value means it would be better to wait with SU, while /// a negative value means it would be good to schedule SU next. - int resourcesCost(SUnit *SU); + int resourcesCost(const SUnit *SU); #ifndef NDEBUG // Debug dumping. 
std::string CurGroupDbg; // current group as text void dumpSU(SUnit *SU, raw_ostream &OS) const; + void dumpMI(MachineInstr *MI, raw_ostream &OS) const; void dumpCurrGroup(std::string Msg = "") const; - void dumpProcResourceCounters() const; + void dumpResourceCounters() const; #endif + + /// Remember the last (in instruction list) call in MBB. + void setLastCall(MachineBasicBlock::iterator Call) { + if (LastCall == nullptr) + LastCall = Call; + } + + /// Leave MBB after scheduling is done. + void leaveMBB(); }; } // namespace llvm Index: lib/Target/SystemZ/SystemZHazardRecognizer.cpp =================================================================== --- lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -19,6 +19,13 @@ // * Processor resources usage. It is beneficial to balance the use of // resources. // +// A goal is to consider all instructions, also those outside of any +// scheduling region. Such instructions are "advanced" past and include +// single instructions before a scheduling region, branches etc. +// +// A block that has only one predecessor continues scheduling with the state +// of it (which may be updated by emitting branches). +// // ===---------------------------------------------------------------------===// #include "SystemZHazardRecognizer.h" @@ -26,23 +33,60 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" // This is the limit of processor resource usage at which the // scheduler should try to look for other instructions (not using the // critical resource). 
-static cl::opt ProcResCostLim("procres-cost-lim", cl::Hidden, +static cl::opt ResourceCostLim("procres-cost-lim", cl::Hidden, cl::desc("The OOO window for processor " "resources during scheduling."), cl::init(8)); SystemZHazardRecognizer:: -SystemZHazardRecognizer(const MachineSchedContext *C) : DAG(nullptr), - SchedModel(nullptr) {} +SystemZHazardRecognizer(const TargetSubtargetInfo *ST, MBB2HazRec *SchedS_) + : SchedStates(SchedS_), DAG(nullptr), TII(nullptr), + MBB(nullptr), Loop(nullptr), Begin(nullptr), CurrGroupSize(0), + LastFPdOpCycleIdx(UINT_MAX), GrpCount(0), LastEmittedMI(nullptr), + LastCall(nullptr) { + TII = static_cast(ST->getInstrInfo()); + SchedModel.init(ST->getSchedModel(), ST, TII); + clearResourceCounters(); +} + +void SystemZHazardRecognizer::enterRegion(MachineBasicBlock *MBB_, + const MachineLoop *Loop_, + MachineBasicBlock::iterator Begin_) { + Reset(); + MBB = MBB_; + Loop = Loop_; + Begin = Begin_; +} + +void SystemZHazardRecognizer::initialize(ScheduleDAGMI *dag) { + DAG = dag; + + // There may be non-scheduled instructions before Begin. Look backwards + // until beginning of block or a call. + MachineBasicBlock::iterator PreRegBegin = Begin; + for (; PreRegBegin != MBB->begin(); --PreRegBegin) { + if (std::prev(PreRegBegin)->isCall()) + break; + } + + // If this is top-most in MBB, try to take over the state from a single + // predecessor. + if (PreRegBegin == MBB->begin()) + takeStateFromPred(); + + // Emit any instructions before Begin. + advance(PreRegBegin, Begin); +} unsigned SystemZHazardRecognizer:: -getNumDecoderSlots(SUnit *SU) const { - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); +getNumDecoderSlots(const MachineInstr *MI) const { + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); + if (!SC->isValid()) return 0; // IMPLICIT_DEF / KILL -- will not make impact in output. 
@@ -58,27 +102,33 @@ unsigned SystemZHazardRecognizer::getCurrCycleIdx() { unsigned Idx = CurrGroupSize; - if (GrpCount % 2) + if ((GrpCount % 2) != 0) Idx += 3; return Idx; } ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer:: getHazardType(SUnit *m, int Stalls) { - return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard); + return (fitsIntoCurrentGroup(m->getInstr()) ? NoHazard : Hazard); } void SystemZHazardRecognizer::Reset() { + DAG = nullptr; + MBB = nullptr; + Loop = nullptr; + Begin = nullptr; CurrGroupSize = 0; - clearProcResCounters(); - GrpCount = 0; + clearResourceCounters(); LastFPdOpCycleIdx = UINT_MAX; - DEBUG(CurGroupDbg = "";); + GrpCount = 0; + LastEmittedMI = nullptr; + LastCall = nullptr; + DEBUG (CurGroupDbg = "";); } bool -SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); +SystemZHazardRecognizer::fitsIntoCurrentGroup(MachineInstr *MI) const { + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); if (!SC->isValid()) return true; @@ -87,19 +137,19 @@ if (SC->BeginGroup) return (CurrGroupSize == 0); - // Since a full group is handled immediately in EmitInstruction(), + // Since a full group is handled immediately in emitInstruction(), // SU should fit into current group. NumSlots should be 1 or 0, // since it is not a cracked or expanded instruction. - assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) && + assert ((getNumDecoderSlots(MI) <= 1) && (CurrGroupSize < 3) && "Expected normal instruction to fit in non-full group!"); return true; } -void SystemZHazardRecognizer::nextGroup(bool DbgOutput) { +void SystemZHazardRecognizer::nextGroup() { if (CurrGroupSize > 0) { - DEBUG(dumpCurrGroup("Completed decode group")); - DEBUG(CurGroupDbg = "";); + DEBUG (dumpCurrGroup("Completed decode group")); + DEBUG (CurGroupDbg = "";); GrpCount++; @@ -107,35 +157,39 @@ CurrGroupSize = 0; // Decrease counters for execution units by one. 
- for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i) - if (ProcResourceCounters[i] > 0) - ProcResourceCounters[i]--; + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + if (ResourceCounters[i] > 0) + ResourceCounters[i]--; // Clear CriticalResourceIdx if it is now below the threshold. if (CriticalResourceIdx != UINT_MAX && - (ProcResourceCounters[CriticalResourceIdx] <= - ProcResCostLim)) + (ResourceCounters[CriticalResourceIdx] <= ResourceCostLim)) CriticalResourceIdx = UINT_MAX; } - DEBUG(if (DbgOutput) - dumpProcResourceCounters();); + DEBUG (dumpResourceCounters();); } #ifndef NDEBUG // Debug output -void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const { +void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const{ OS << "SU(" << SU->NodeNum << "):"; - OS << SchedModel->getInstrInfo()->getName(SU->getInstr()->getOpcode()); + dumpMI(SU->getInstr(), OS); + if (SU->isUnbuffered) + OS << "/Unbuffered"; +} + +void SystemZHazardRecognizer::dumpMI(MachineInstr *MI, raw_ostream &OS) const{ + OS << TII->getName(MI->getOpcode()); + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); if (!SC->isValid()) return; for (TargetSchedModel::ProcResIter - PI = SchedModel->getWriteProcResBegin(SC), - PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { const MCProcResourceDesc &PRD = - *SchedModel->getProcResource(PI->ProcResourceIdx); + *SchedModel.getProcResource(PI->ProcResourceIdx); std::string FU(PRD.Name); // trim e.g. 
Z13_FXaUnit -> FXa FU = FU.substr(FU.find("_") + 1); @@ -154,8 +208,6 @@ OS << "/BeginsGroup"; else if (SC->EndGroup) OS << "/EndsGroup"; - if (SU->isUnbuffered) - OS << "/Unbuffered"; } void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const { @@ -172,11 +224,11 @@ } } -void SystemZHazardRecognizer::dumpProcResourceCounters() const { +void SystemZHazardRecognizer::dumpResourceCounters() const { bool any = false; - for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i) - if (ProcResourceCounters[i] > 0) { + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + if (ResourceCounters[i] > 0) { any = true; break; } @@ -185,93 +237,146 @@ return; dbgs() << "+++ Resource counters:\n"; - for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i) - if (ProcResourceCounters[i] > 0) { + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + if (ResourceCounters[i] > 0) { dbgs() << "+++ Extra schedule for execution unit " - << SchedModel->getProcResource(i)->Name - << ": " << ProcResourceCounters[i] << "\n"; - any = true; + << SchedModel.getProcResource(i)->Name + << ": " << ResourceCounters[i] << "\n"; } } #endif //NDEBUG -void SystemZHazardRecognizer::clearProcResCounters() { - ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0); +void SystemZHazardRecognizer::clearResourceCounters() { + ResourceCounters.assign(SchedModel.getNumProcResourceKinds(), 0); CriticalResourceIdx = UINT_MAX; } -// Update state with SU as the next scheduled unit. +// Update state with MI as next instruction. If SU is null, this +// is e.g. a scheduling boundary. 
void SystemZHazardRecognizer:: -EmitInstruction(SUnit *SU) { - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); - DEBUG( dumpCurrGroup("Decode group before emission");); +emitInstruction(MachineInstr *MI, SUnit *SU) { + assert (!MI->isBranch() && "Did not expect a branch here."); + assert (SU == nullptr || MI == SU->getInstr()); - // If scheduling an SU that must begin a new decoder group, move on - // to next group. - if (!fitsIntoCurrentGroup(SU)) + DEBUG (dumpCurrGroup("Decode group before emission");); + + // If scheduling an MI that must begin a new decoder group, do so. + if (!fitsIntoCurrentGroup(MI)) nextGroup(); - DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs()); - dbgs() << "\n"; - raw_string_ostream cgd(CurGroupDbg); - if (CurGroupDbg.length()) - cgd << ", "; - dumpSU(SU, cgd);); + DEBUG (if (SU != nullptr) { + dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs()); + dbgs() << "\n"; + raw_string_ostream cgd(CurGroupDbg); + if (CurGroupDbg.length()) + cgd << ", "; + dumpSU(SU, cgd); + } else { + dbgs() << "+++ Advancing past: "; + dumpMI(MI, dbgs()); + dbgs() << "\n"; + + raw_string_ostream cgd(CurGroupDbg); + if (CurGroupDbg.length()) + cgd << ", "; + cgd << TII->getName(MI->getOpcode()); + }); + + LastEmittedMI = MI; // After returning from a call, we don't know much about the state. - if (SU->getInstr()->isCall()) { + if (MI->isCall()) { DEBUG (dbgs() << "+++ Clearing state after call.\n";); - clearProcResCounters(); + clearResourceCounters(); LastFPdOpCycleIdx = UINT_MAX; - CurrGroupSize += getNumDecoderSlots(SU); + CurrGroupSize += getNumDecoderSlots(MI); assert (CurrGroupSize <= 3); nextGroup(); return; } + // Make note of an instruction that uses a blocking resource (FPd). 
+ if ((SU != nullptr && SU->isUnbuffered)) { + LastFPdOpCycleIdx = getCurrCycleIdx(); + DEBUG (dbgs() << "+++ Last FPd cycle index: " + << LastFPdOpCycleIdx << "\n";); + } + + emitInstructionIntoCurrentDecoderGroup(MI); +} + +void SystemZHazardRecognizer:: +emitInstructionIntoCurrentDecoderGroup(MachineInstr *MI) { + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); + // Increase counter for execution unit(s). for (TargetSchedModel::ProcResIter - PI = SchedModel->getWriteProcResBegin(SC), - PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { // Don't handle FPd together with the other resources. - if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1) + if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) continue; int &CurrCounter = - ProcResourceCounters[PI->ProcResourceIdx]; + ResourceCounters[PI->ProcResourceIdx]; CurrCounter += PI->Cycles; // Check if this is now the new critical resource. - if ((CurrCounter > ProcResCostLim) && + if ((CurrCounter > ResourceCostLim) && (CriticalResourceIdx == UINT_MAX || (PI->ProcResourceIdx != CriticalResourceIdx && - CurrCounter > - ProcResourceCounters[CriticalResourceIdx]))) { - DEBUG( dbgs() << "+++ New critical resource: " - << SchedModel->getProcResource(PI->ProcResourceIdx)->Name + CurrCounter > ResourceCounters[CriticalResourceIdx]))) { + DEBUG (dbgs() << "+++ New critical resource: " + << SchedModel.getProcResource(PI->ProcResourceIdx)->Name << "\n";); CriticalResourceIdx = PI->ProcResourceIdx; } } - // Make note of an instruction that uses a blocking resource (FPd). 
- if (SU->isUnbuffered) { - LastFPdOpCycleIdx = getCurrCycleIdx(); - DEBUG (dbgs() << "+++ Last FPd cycle index: " - << LastFPdOpCycleIdx << "\n";); - } - - // Insert SU into current group by increasing number of slots used + // Insert MI into current group by increasing number of slots used // in current group. - CurrGroupSize += getNumDecoderSlots(SU); + CurrGroupSize += getNumDecoderSlots(MI); assert (CurrGroupSize <= 3); - // Check if current group is now full/ended. If so, move on to next - // group to be ready to evaluate more candidates. + // Check if current group is now full/ended. If so, reset counter to + // be ready to evaluate candidates again. if (CurrGroupSize == 3 || SC->EndGroup) nextGroup(); } -int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); +bool SystemZHazardRecognizer::emitIncomingBranch(MachineInstr *MI) { + DEBUG (dbgs() << "+++ Emitting incoming branch: "; MI->dump();); + + // If scheduling an MI that must begin a new decoder group, do so. + if (!fitsIntoCurrentGroup(MI)) + nextGroup(); + + DEBUG ({ raw_string_ostream cgd(CurGroupDbg); + if (CurGroupDbg.length()) + cgd << ", "; + dumpMI(MI, cgd); }); + + emitInstructionIntoCurrentDecoderGroup(MI); + + if (MI->isBranch() && + (TII->getBranchInfo(*MI).Target->isReg() || // Relative branch + TII->getBranchInfo(*MI).Target->getMBB() == MBB)) { + // Taken branch from predecessor + if (CurrGroupSize > 0) + nextGroup(); + return true; + } + + assert ((MI->isBranch() || MI->isReturn() || MI->getOpcode() == SystemZ::CondTrap) && + "Scheduler: expected a branch or conditional return/trap"); + + // NT branches end group after first decoder slot. + if (CurrGroupSize == 2) + nextGroup(); + + return false; +} + +int SystemZHazardRecognizer::groupingCost(const SUnit *SU) const { + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(SU->getInstr()); if (!SC->isValid()) return 0; @@ -287,7 +392,7 @@ // end the group prematurely. 
if (SC->EndGroup) { unsigned resultingGroupSize = - (CurrGroupSize + getNumDecoderSlots(SU)); + (CurrGroupSize + getNumDecoderSlots(SU->getInstr())); if (resultingGroupSize < 3) return (3 - resultingGroupSize); return -1; @@ -312,10 +417,10 @@ } int SystemZHazardRecognizer:: -resourcesCost(SUnit *SU) { +resourcesCost(const SUnit *SU) { int Cost = 0; - const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(SU->getInstr()); if (!SC->isValid()) return 0; @@ -326,8 +431,8 @@ // For other instructions, give a cost to the use of the critical resource. else if (CriticalResourceIdx != UINT_MAX) { for (TargetSchedModel::ProcResIter - PI = SchedModel->getWriteProcResBegin(SC), - PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) if (PI->ProcResourceIdx == CriticalResourceIdx) Cost = PI->Cycles; } @@ -335,3 +440,89 @@ return Cost; } +void SystemZHazardRecognizer:: +advance(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator NextBegin) { + for (; I != NextBegin; ++I) { + if (I->isPosition() || I->isDebugValue()) + continue; + emitInstruction(&*I); + } +} + +// Try to find a single predecessor that would be interesting for the +// scheduler in the top-most region of MBB. +static MachineBasicBlock *getSingleSchedPred(MachineBasicBlock *MBB, + const MachineLoop *Loop) { + if (MBB->pred_size() == 1) + return *MBB->pred_begin(); + + // The loop header has two predecessors, return the latch, but not for a + // single block loop. + if (MBB->pred_size() == 2 && Loop != nullptr && Loop->getHeader() == MBB) { + for (auto I = MBB->pred_begin(); I != MBB->pred_end(); ++I) + if (Loop->contains(*I)) + return (*I == MBB ? nullptr : *I); + } + + return nullptr; +} + +void SystemZHazardRecognizer:: +takeStateFromPred() { + // Take state from single predecessor MBB, if it has been scheduled. 
+ MachineBasicBlock *SinglePredMBB = getSingleSchedPred(MBB, Loop); + assert (!SinglePredMBB || !Loop || Loop->contains(SinglePredMBB)); + if (SinglePredMBB == nullptr || + SchedStates->find(SinglePredMBB) == SchedStates->end()) + return; + + // Be optimistic and assume that branch prediction will generally do "the + // right thing". + + // Get incoming scheduler state. + SystemZHazardRecognizer *incoming = (*SchedStates)[SinglePredMBB]; + DEBUG (dbgs() << "+++ Continued scheduling from MBB#" + << incoming->MBB->getNumber() << "\n";); + + // Current decoder group + CurrGroupSize = incoming->CurrGroupSize; + DEBUG (CurGroupDbg = incoming->CurGroupDbg;); + + // Processor resources + ResourceCounters = incoming->ResourceCounters; + CriticalResourceIdx = incoming->CriticalResourceIdx; + + // FPd + LastFPdOpCycleIdx = incoming->LastFPdOpCycleIdx; + GrpCount = incoming->GrpCount; + + // Emit incoming terminator(s). + for (MachineBasicBlock::iterator I = incoming->MBB->getFirstTerminator(); + I != incoming->MBB->end(); I++) + if (emitIncomingBranch(&*I)) + break; +} + +void SystemZHazardRecognizer::leaveMBB() { + DAG = nullptr; + + MachineBasicBlock::iterator I; + if (LastEmittedMI != nullptr) + // If scheduling was done, emit everything after the region. + I = std::next(MachineBasicBlock::iterator(LastEmittedMI)); + else if (LastCall != nullptr) + // Otherwise, emit everything after the last call in MBB, if there is + // one. + I = std::next(MachineBasicBlock::iterator(LastCall)); + else { + // Get the correct final state by emitting the whole MBB. + takeStateFromPred(); + I = MBB->begin(); + } + + // Advance to first terminator. The successor block will handle them in + // takeStateFromPred(). 
+ advance(I, MBB->getFirstTerminator()); +} + Index: lib/Target/SystemZ/SystemZMachineScheduler.h =================================================================== --- lib/Target/SystemZ/SystemZMachineScheduler.h +++ lib/Target/SystemZ/SystemZMachineScheduler.h @@ -7,11 +7,12 @@ // //===----------------------------------------------------------------------===// // -// -------------------------- Post RA scheduling ---------------------------- // +// -------------------------- Post RA scheduling ---------------------------- // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into -// the MachineScheduler. It has a sorted Available set of SUs and a pickNode() -// implementation that looks to optimize decoder grouping and balance the -// usage of processor resources. +// the MachineScheduler. It has a sorted Available set of SUs and a +// pickNode() implementation that looks to optimize decoder grouping and +// balance the usage of processor resources. Scheduler states are saved for +// the end region of each MBB, so that a successor block can learn from it. //===----------------------------------------------------------------------===// #include "SystemZHazardRecognizer.h" @@ -29,7 +30,7 @@ /// A MachineSchedStrategy implementation for SystemZ post RA scheduling. class SystemZPostRASchedStrategy : public MachineSchedStrategy { ScheduleDAGMI *DAG; - + /// A candidate during instruction evaluation. struct Candidate { SUnit *SU = nullptr; @@ -79,12 +80,34 @@ /// The set of available SUs to schedule next. SUSet Available; - // HazardRecognizer that tracks the scheduler state for the current - // region. - SystemZHazardRecognizer HazardRec; - + /// Maintain hazard recognizers for all blocks, so that the scheduler state + /// can be maintained past BB boundaries when appropriate. + MBB2HazRec SchedStates; + + /// HazardRecognizer that tracks the scheduler state for the bottom-most + /// region of each MBB. 
+ SystemZHazardRecognizer *HazardRec; + + /// A temporary HazardRecognizer used for regions that are separated (by a + /// call) from the end region of the MBB. + SystemZHazardRecognizer TmpHazRec; + + /// Since there is no virtual leaveRegion() method, use a pointer to check + /// when scheduler has changed MBB. + MachineBasicBlock *PreviouslyVisitedMBB; + + /// Loops are checked so that headers can be identified in + /// takeStateFromPred(). + const MachineLoopInfo *MLI; + public: SystemZPostRASchedStrategy(const MachineSchedContext *C); + virtual ~SystemZPostRASchedStrategy(); + + /// Called for a region before scheduling. + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override; /// PostRA scheduling does not track pressure. bool shouldTrackPressure() const override { return false; } Index: lib/Target/SystemZ/SystemZMachineScheduler.cpp =================================================================== --- lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -7,18 +7,19 @@ // //===----------------------------------------------------------------------===// // -// -------------------------- Post RA scheduling ---------------------------- // +// -------------------------- Post RA scheduling ---------------------------- // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into -// the MachineScheduler. It has a sorted Available set of SUs and a pickNode() -// implementation that looks to optimize decoder grouping and balance the -// usage of processor resources. -//===----------------------------------------------------------------------===// +// the MachineScheduler. It has a sorted Available set of SUs and a +// pickNode() implementation that looks to optimize decoder grouping and +// balance the usage of processor resources. 
Scheduler states are saved for +// the end region of each MBB, so that a successor block can learn from it. +// ===----------------------------------------------------------------------===// #include "SystemZMachineScheduler.h" using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" #ifndef NDEBUG // Print the set of SUs @@ -36,12 +37,58 @@ SystemZPostRASchedStrategy:: SystemZPostRASchedStrategy(const MachineSchedContext *C) - : DAG(nullptr), HazardRec(C) {} + : DAG(nullptr), HazardRec(nullptr), + TmpHazRec(&C->MF->getSubtarget(), &SchedStates), + PreviouslyVisitedMBB(nullptr), MLI(C->MLI) {} + +SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() { + // Delete hazard recognizers kept around for each MBB. + for (auto I : SchedStates) { + SystemZHazardRecognizer *hazrec = I.second; + delete hazrec; + } +} + +void SystemZPostRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + MachineBasicBlock *MBB = Begin->getParent(); + const MachineLoop *Loop = MLI->getLoopFor(MBB); + + // If a new MBB has been entered, finalize the previous MBB. + if (PreviouslyVisitedMBB != nullptr && PreviouslyVisitedMBB != MBB) + SchedStates.find(PreviouslyVisitedMBB)->second->leaveMBB(); + PreviouslyVisitedMBB = MBB; + + // We can maintain the scheduler state perfectly even when scheduling regions + // in reverse order (bottom-up) in MBB, because the only scheduling + // boundaries we have are calls, which simply reset the state. + if (SchedStates.find(MBB) != SchedStates.end()) { + // Use the temporary HazardRecognizer for any regions above calls. + DEBUG (dbgs() << "+++ Continuing in MBB#" << MBB->getNumber()); + HazardRec = &TmpHazRec; + } else { + // First time in MBB is the bottom-most region. Make a new + // HazardRecognizer and save it for use by successor block. 
+ DEBUG (dbgs() << "+++ Entering MBB#" << MBB->getNumber()); + HazardRec = new SystemZHazardRecognizer(&MBB->getParent()->getSubtarget(), + &SchedStates); + SchedStates[MBB] = HazardRec; + } + + DEBUG (if(Loop && Loop->getHeader() == MBB) + dbgs() << " (Loop header)"; + dbgs() << ":\n";); + + if (End != MBB->end() && End->isCall()) + SchedStates[MBB]->setLastCall(End); + + HazardRec->enterRegion(MBB, Loop, Begin); +} void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) { DAG = dag; - HazardRec.setDAG(dag); - HazardRec.Reset(); + HazardRec->initialize(dag); } // Pick the next node to schedule. @@ -55,31 +102,32 @@ // If only one choice, return it. if (Available.size() == 1) { DEBUG (dbgs() << "+++ Only one: "; - HazardRec.dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";); + HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";); return *Available.begin(); } // All nodes that are possible to schedule are stored by in the // Available set. - DEBUG(dbgs() << "+++ Available: "; Available.dump(HazardRec);); + DEBUG (dbgs() << "+++ Available: "; Available.dump(*HazardRec);); Candidate Best; for (auto *SU : Available) { // SU is the next candidate to be compared against current Best. - Candidate c(SU, HazardRec); + Candidate c(SU, *HazardRec); // Remeber which SU is the best candidate. 
if (Best.SU == nullptr || c < Best) { Best = c; - DEBUG(dbgs() << "+++ Best sofar: "; - HazardRec.dumpSU(Best.SU, dbgs()); - if (Best.GroupingCost != 0) - dbgs() << "\tGrouping cost:" << Best.GroupingCost; - if (Best.ResourcesCost != 0) - dbgs() << " Resource cost:" << Best.ResourcesCost; - dbgs() << " Height:" << Best.SU->getHeight(); - dbgs() << "\n";); + DEBUG (dbgs() << "+++ Best sofar: "; + HazardRec->dumpSU(Best.SU, dbgs()); + if (Best.GroupingCost != 0) + dbgs() << "\tGrouping cost:" << Best.GroupingCost; + if (Best.ResourcesCost != 0) + dbgs() << Best.ResourcesCost; + + dbgs() << " Height:" << Best.SU->getHeight(); + dbgs() << "\n";); } // Once we know we have seen all SUs that affect grouping or use unbuffered @@ -134,11 +182,11 @@ } void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { - DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";); + DEBUG (dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";); // Remove SU from Available set and update HazardRec. Available.erase(SU); - HazardRec.EmitInstruction(SU); + HazardRec->EmitInstruction(SU); } void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) { Index: test/CodeGen/SystemZ/int-cmp-48.ll =================================================================== --- test/CodeGen/SystemZ/int-cmp-48.ll +++ test/CodeGen/SystemZ/int-cmp-48.ll @@ -29,8 +29,8 @@ define void @f2(i8 *%src) { ; CHECK-LABEL: f2: ; CHECK: llc [[REG:%r[0-5]]], 0(%r2) -; CHECK: tmll [[REG]], 1 -; CHECK: mvi 0(%r2), 0 +; CHECK-DAG: mvi 0(%r2), 0 +; CHECK-DAG: tmll [[REG]], 1 ; CHECK: ber %r14 ; CHECK: br %r14 entry: