Index: include/llvm/CodeGen/MachineModuloSched.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/MachineModuloSched.h @@ -0,0 +1,190 @@ +//===--- llvm/CodeGen/MachineModuloSched.h - Common Base Class---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the data dependence graph class, which is used as the +// common base class for software pipeline schedulers. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINEMODULOSCHED_H +#define LLVM_CODEGEN_MACHINEMODULOSCHED_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" + +namespace llvm { + +/// The main class in the implementation of the target independent +/// software pipeliner pass. +class MachineModuloSched : public MachineFunctionPass { +public: + MachineFunction *MF = nullptr; + const MachineLoopInfo *MLI = nullptr; + const MachineDominatorTree *MDT = nullptr; + const InstrItineraryData *InstrItins; + const TargetInstrInfo *TII = nullptr; + RegisterClassInfo RegClassInfo; + +#ifndef NDEBUG + static int NumTries; +#endif + /// Cache the target analysis information about the loop. + struct LoopInfo { + MachineBasicBlock *TBB = nullptr; + MachineBasicBlock *FBB = nullptr; + SmallVector BrCond; + MachineInstr *LoopInductionVar = nullptr; + MachineInstr *LoopCompare = nullptr; + }; + LoopInfo LI; + + static char ID; + MachineModuloSched() : MachineFunctionPass(ID) { + initializeMachineModuloSchedPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool canPipelineLoop(MachineLoop &L); + bool scheduleLoop(MachineLoop &L); + bool swpModuloScheduler(MachineLoop &L); +}; + +/// This class builds the dependence graph for the instructions in a loop, +/// and attempts to schedule the instructions using the modulo algorithm. +class ModuloSchedulerDAG : public ScheduleDAGInstrs { + MachineModuloSched &Pass; + /// The minimum initiation interval between iterations for this schedule. + unsigned MII; + /// Set to true if a valid pipelined schedule is found for the loop. + bool Scheduled; + MachineLoop &Loop; + LiveIntervals &LIS; + const RegisterClassInfo &RegClassInfo; + + /// Ordered list of DAG postprocessing steps. + std::vector> Mutations; + +public: + /// DdgUnit data structure to help building loop carried dependence. + class DdgUnit { + public: + ModuloSchedulerDAG *DAG; + + SUnit *SU; + // DdgUnit node number, it is equal to SU->NodeNum. + unsigned NodeNum; + + // Preds and Succs will include backedges. + SmallVector Preds; + SmallVector Succs; + + DdgUnit(ModuloSchedulerDAG *DAG, SUnit *SU) + : DAG(DAG), SU(SU), NodeNum(SU->NodeNum), Preds(SU->Preds), + Succs(SU->Succs) {} + bool addPred(const SDep &D, DdgUnit *DDef); + }; + + /// This array holds the ddg nodes in the graph; + std::vector DdgUnits; + /// Each SUnit in the DDG block is mapped to an DdgUnit. + DenseMap DdgUnitMap; + + /// Array and number of backedges (edges with distance > 0) in the DDG. + unsigned NumBackEdges = 0; + std::vector> BackEdges; + + // Holds information on an SCC (Strongly Connected Component) of the DDG. + class Scc { + public: + ModuloSchedulerDAG *DAG; + // A bitmap that represents the DdgUnits that are in the SCC. + BitVector Nodes; + int NumNodes; + + // Array and number of backedges in the SCC. + std::vector> BackEdges; + int NumBackEdges; + + public: + Scc(ModuloSchedulerDAG *DAG, int NumNodes) + : DAG(DAG), NumNodes(NumNodes), NumBackEdges(0) {} + void creatScc(BitVector &SccNodes); + }; + + std::set InSccs; + /// Array that holds the SCCs and their number. + std::vector Sccs; + int NumSccs = 0; + + ModuloSchedulerDAG(MachineModuloSched &P, MachineLoop &L, LiveIntervals &lis, + const RegisterClassInfo &rci) + : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), MII(0), + Scheduled(false), Loop(L), LIS(lis), RegClassInfo(rci) { + P.MF->getSubtarget().getSMSMutations(Mutations); + } + + void schedule() override; + + /// Return true if the loop kernel has been scheduled. + bool hasNewSchedule() { return Scheduled; } + + bool addLoopCarriedDependences(); + DdgUnit *getDdgUnit(SUnit *SU); + bool isBackEdge(DdgUnit *Src, DdgUnit *Dst) { + return Src->NodeNum >= Dst->NodeNum; + } + + void buildSccs(); + void findNodesByOrder(unsigned NodeNum, BitVector &Result, bool Succs); + + void viewGraph(const Twine &Name, const Twine &Title); + void viewGraph(); + +private: + void postprocessDAG(); + void addLoopCarriedRegDeps(); + void createLoopRegDep(SUnit *Use, SDep &Dep); + void addLoopCarriedRegDep(MachineRegisterInfo &MRI, unsigned Reg); + void addLoopCarriedMemDeps(); + void createLoopMemDep(SUnit *Src, SUnit *Dstint, int Latency); + int getLastDefOperand(MachineRegisterInfo &MRI, unsigned Reg, unsigned &Min, + unsigned &Max); + MachineInstr *getSingleDef(unsigned Reg, const MachineRegisterInfo *MRI); + bool computeDelta(MachineInstr &MI, unsigned &Delta); + DdgUnit *newDdgUnit(SUnit *SU); +}; + +} // namespace llvm + +#endif Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -423,6 +423,11 @@ /// This pass performs software pipelining on machine instructions. extern char &MachinePipelinerID; + /// This pass performs modulo scheduling on machine instructions. + /// This is an improved version of the origin machine pipeline pass + /// which will base on non-ssa machine instruction mode. + extern char &MachineModuloSchedID; + /// This pass frees the memory occupied by the MachineFunction. FunctionPass *createFreeMachineFunctionPass(); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -259,6 +259,7 @@ void initializeMachineModuleInfoPass(PassRegistry&); void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry&); void initializeMachineOutlinerPass(PassRegistry&); +void initializeMachineModuloSchedPass(PassRegistry&); void initializeMachinePipelinerPass(PassRegistry&); void initializeMachinePostDominatorTreePass(PassRegistry&); void initializeMachineRegionInfoPassPass(PassRegistry&); Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -81,6 +81,7 @@ MachineLoopInfo.cpp MachineModuleInfo.cpp MachineModuleInfoImpls.cpp + MachineModuloSched.cpp MachineOperand.cpp MachineOptimizationRemarkEmitter.cpp MachineOutliner.cpp Index: lib/CodeGen/CodeGen.cpp =================================================================== --- lib/CodeGen/CodeGen.cpp +++ lib/CodeGen/CodeGen.cpp @@ -64,6 +64,7 @@ initializeMachineLICMPass(Registry); initializeMachineLoopInfoPass(Registry); initializeMachineModuleInfoPass(Registry); + initializeMachineModuloSchedPass(Registry); initializeMachineOptimizationRemarkEmitterPassPass(Registry); initializeMachineOutlinerPass(Registry); initializeMachinePipelinerPass(Registry); Index: lib/CodeGen/MachineModuloSched.cpp =================================================================== --- /dev/null +++ lib/CodeGen/MachineModuloSched.cpp @@ -0,0 +1,955 @@ +//===--------MachineModuloSched.cpp ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Build the data dependence graph(DDG) for a loop. This allows software +// pipeliner(SWP) or other schedulers using the ddg info to make the scheduling +// codes better. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineModuloSched.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/GraphWriter.h" + +using namespace llvm; + +#define DEBUG_TYPE "modulo-sched" + +/// A command line option to turn software pipelining on or off. +static cl::opt EnableSWP("enable-modulo-sched", cl::Hidden, + cl::init(true), cl::ZeroOrMore, + cl::desc("Enable Modulo Software Pipelining")); + +/// A command line option to enable SWP at -Os/-Oz. +static cl::opt EnableSWPOptSize("enable-swp-opt-size", + cl::desc("Enable SWP at Os."), cl::Hidden, + cl::init(false)); + +/// A command line argument to limit the number of stages in the pipeline. +static cl::opt + SWPMaxStages("swp-max-stages", + cl::desc("Maximum stages allowed in the generated scheduled."), + cl::Hidden, cl::init(3)); + +char MachineModuloSched::ID = 0; +char &llvm::MachineModuloSchedID = MachineModuloSched::ID; +INITIALIZE_PASS_BEGIN(MachineModuloSched, "modulo-sched", + "Modulo Software Pipelining", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(MachineModuloSched, "modulo-sched", + "Modulo Software Pipelining", false, false) + +/// The "main" function for implementing machine modulo scheduling. +bool MachineModuloSched::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(mf.getFunction())) + return false; + + if (!EnableSWP) + return false; + + if (mf.getFunction().getAttributes().hasAttribute( + AttributeList::FunctionIndex, Attribute::OptimizeForSize) && + !EnableSWPOptSize.getPosition()) + return false; + + MF = &mf; + MLI = &getAnalysis(); + MDT = &getAnalysis(); + TII = MF->getSubtarget().getInstrInfo(); + RegClassInfo.runOnMachineFunction(*MF); + + for (auto &L : *MLI) + scheduleLoop(*L); + + return false; +} + +/// Attempt to perform the SWP algorithm on the specified loop. This function is +/// the main entry point for the algorithm. The function identifies candidate +/// loops, calculates the minimum initiation interval, and attempts to schedule +/// the loop. +bool MachineModuloSched::scheduleLoop(MachineLoop &L) { + bool Changed = false; + for (auto &InnerLoop : L) + Changed |= scheduleLoop(*InnerLoop); + + if (!canPipelineLoop(L)) + return Changed; + + Changed = swpModuloScheduler(L); + + return Changed; +} + +/// Return true if the loop can be software pipelined. The algorithm is +/// restricted to loops with a single basic block. Make sure that the +/// branch in the loop can be analyzed. +bool MachineModuloSched::canPipelineLoop(MachineLoop &L) { + if (L.getNumBlocks() != 1) + return false; + + // Check if the branch can't be understood because we can't do pipelining + // if that's the case. + LI.TBB = nullptr; + LI.FBB = nullptr; + LI.BrCond.clear(); + if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) + return false; + + LI.LoopInductionVar = nullptr; + LI.LoopCompare = nullptr; + if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) + return false; + + if (!L.getLoopPreheader()) + return false; + + // If any of the Phis contain subregs, then we can't pipeline + // because we don't know how to maintain subreg information in the + // VMap structure. + MachineBasicBlock *MBB = L.getHeader(); + for (MachineBasicBlock::iterator BBI = MBB->instr_begin(), + BBE = MBB->getFirstNonPHI(); + BBI != BBE; ++BBI) + for (unsigned i = 1; i != BBI->getNumOperands(); i += 2) + if (BBI->getOperand(i).getSubReg() != 0) + return false; + + return true; +} + +/// The SWP algorithm consists of the following main steps: +/// 1. Computation and analysis of the dependence graph. +/// 2. Ordering of the nodes (instructions). +/// 3. Attempt to Schedule the loop. +bool MachineModuloSched::swpModuloScheduler(MachineLoop &L) { + assert(L.getBlocks().size() == 1 && "SWP works on single blocks only."); + + LLVM_DEBUG(dbgs() << "Attempt SWP on BB#" << L.getBlocks()[0]->getNumber() + << "\n"); + + ModuloSchedulerDAG SWP(*this, L, getAnalysis(), RegClassInfo); + + MachineBasicBlock *MBB = L.getHeader(); + // The kernel should not include any terminator instructions. These + // will be added back later. + SWP.startBlock(MBB); + + // Compute the number of 'real' instructions in the basic block by + // ignoring terminators. + unsigned size = MBB->size(); + for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(), + E = MBB->instr_end(); + I != E; ++I, --size) + ; + + SWP.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size); + SWP.schedule(); + SWP.exitRegion(); + + SWP.finishBlock(); + return SWP.hasNewSchedule(); +} + +void ModuloSchedulerDAG::postprocessDAG() { + for (auto &M : Mutations) + M->apply(this); +} + +/// Override the schedule function in ScheduleDAGInstrs to implement the +/// scheduling part of the Modulo Scheduling algorithm. +/// Create a data dependence graph(DDG) for the loop. There are two kinds of +/// dependence, inner loop deps and loop carried deps. Modulo scheduler will +/// call buildSchedGraph or buildDAGWithRegPressure to build inner loop deps. +/// And call addLoopCarriedDependences to build loop carried deps. Finally, we +/// can combine the existing DAG diagram, and form a complete data dependency +/// graph(DDG). +void ModuloSchedulerDAG::schedule() { + // Build inner loop dependence for modulo scheduler. + AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); + buildSchedGraph(AA); + postprocessDAG(); + LLVM_DEBUG(dump()); + + // Add loop carried dependence for modulo scheduler. + if (!addLoopCarriedDependences()) + return; + + // build the strongly connected component graph(SCC) for modulo scheduler. + buildSccs(); + + // TODO: SWP algorithm will be implemented here based on the SCCs result. +} + +/// Add loop carried dependence for registers and memories, and record all the +/// BackEdges. +bool ModuloSchedulerDAG::addLoopCarriedDependences() { + // There is nothing to do for this BB. + if (SUnits.size() <= 1) + return false; + + // We'll be allocating one DdgUnit for each real instruction in the region, + // which is contained within a basic block. + DdgUnits.reserve(SUnits.size()); + + // Map the SUnit to DdgUnit. + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + assert(i == SU->NodeNum && i == DdgUnits.size() && "Wrong node number."); + DdgUnit *DU = newDdgUnit(SU); + DdgUnitMap[SU] = DU; + } + + // Add the loop carried dependences for memory and registers. + addLoopCarriedMemDeps(); + addLoopCarriedRegDeps(); + + // Find all the backedges, push them into BackEdges vector. + for (auto &DU : DdgUnits) { + for (unsigned i = 0, e = DU.Succs.size(); i != e; ++i) { + SDep *Dep = &DU.Succs[i]; + DdgUnit *DestDU = getDdgUnit(Dep->getSUnit()); + if (isBackEdge(&DU, DestDU)) + BackEdges.push_back(std::make_pair(Dep, &DU)); + } + } + assert(BackEdges.size() == NumBackEdges && "Wrong number of backedges."); + + return true; +} + +/// Return the underlying objects for the memory references of an instruction. +/// This function calls the code in ValueTracking, but first checks that the +/// instruction has a memory operand. +static void getUnderlyingObjects(MachineInstr *MI, + SmallVectorImpl &Objs, + const DataLayout &DL) { + if (!MI->hasOneMemOperand()) + return; + MachineMemOperand *MM = *MI->memoperands_begin(); + if (!MM->getValue()) + return; + GetUnderlyingObjects(const_cast(MM->getValue()), Objs, DL); +} + +/// Return true if the instruction causes a chain between memory +/// references before and after it. +static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) { + return MI.isCall() || MI.hasUnmodeledSideEffects() || + (MI.hasOrderedMemoryRef() && + (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA))); +} + +MachineInstr *ModuloSchedulerDAG::getSingleDef(unsigned Reg, + const MachineRegisterInfo *MRI) { + MachineInstr *Ret = nullptr; + for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { + if (DefMI.getParent() != BB || DefMI.isDebugValue()) + continue; + if (!Ret) + Ret = &DefMI; + else if (Ret != &DefMI) + return nullptr; + } + return Ret; +} + +/// Return true if we can compute the amount the instruction changes +/// during each iteration. Set Delta to the amount of the change. +bool ModuloSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + MachineOperand *BaseOp; + int64_t Offset; + if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI)) + return false; + + if (!BaseOp->isReg()) + return false; + + unsigned BaseReg = BaseOp->getReg(); + + MachineInstr *BaseDef = getSingleDef(BaseReg, &MRI); + if (!BaseDef) + return false; + + int D = 0; + if (!TII->getIncrementValue(*BaseDef, D) && D >= 0) + return false; + + Delta = D; + return true; +} + +/// There are three kinds of loop carried memory dependences: +/// 1) Store-->Load true deps +/// BB: +/// ... +/// Load xxx (a) <-- +/// ... | +/// ... | (True Dep) +/// Store xxx (b) --- +/// ... +/// Branch BB +/// 2) Store-->Store output deps +/// BB: +/// ... +/// Store xxx (a) <-- +/// ... | +/// ... | (Output Dep) +/// Store xxx (b) --- +/// ... +/// Branch BB +/// 3) Load-->Store Anti deps +/// BB: +/// ... +/// Store xxx (a) <-- +/// ... | +/// ... | (Anti Dep) +/// Load xxx (b) --- +/// ... +/// Branch BB +void ModuloSchedulerDAG::addLoopCarriedMemDeps() { + AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); + MapVector> PendingLoadsStores; + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnit *SUI = &SUnits[i]; + MachineInstr *DstInst = SUI->getInstr(); + if (isDependenceBarrier(*DstInst, AA)) { + PendingLoadsStores.clear(); + // For Barrier dependence, we must add loop carried dependence for them. + for (auto &PI : SUI->Preds) { + if (PI.getKind() != SDep::Order) + continue; + + SUnit *SUJ = PI.getSUnit(); + MachineInstr *SrcInst = SUJ->getInstr(); + createLoopMemDep(SUI, SUJ, + (DstInst->mayStore() && SrcInst->mayLoad()) ? 1 : 0); + } + + for (auto &SI : SUI->Succs) { + if (SI.getKind() != SDep::Order) + continue; + + SUnit *SUJ = SI.getSUnit(); + MachineInstr *SrcInst = SUJ->getInstr(); + createLoopMemDep(SUJ, SUI, + (SrcInst->mayStore() && DstInst->mayLoad()) ? 1 : 0); + } + } + if (!DstInst->mayLoadOrStore()) + continue; + + SmallVector Objs; + getUnderlyingObjects(DstInst, Objs, BB->getParent()->getDataLayout()); + for (auto V : Objs) { + SmallVector &SUs = PendingLoadsStores[V]; + SUs.push_back(SUI); + } + + for (unsigned j = 0; j <= i; j++) { + SUnit *SUJ = &SUnits[j]; + MachineInstr *SrcInst = SUJ->getInstr(); + if (!SrcInst->mayLoadOrStore() || isDependenceBarrier(*SrcInst, AA)) + continue; + + if (SUJ->isSucc(SUI)) { + for (auto &PI : SUI->Preds) { + if (PI.getSUnit() != SUJ) + continue; + if (PI.getKind() != SDep::Order) + continue; + + // TODO: This approach is conservative, the function was similar with + // addLoopCarriedDependences in MachinePipeliner.cpp, we can do more + // accurate analysis for it. + if (SrcInst->mayLoadOrStore() && !SUI->isSucc(SUJ)) + createLoopMemDep(SUI, SUJ, DstInst->mayLoad() ? 0 : 1); + } + } else { + for (auto V : Objs) { + MapVector>::iterator I = + PendingLoadsStores.find(V); + if (I == PendingLoadsStores.end()) + continue; + + bool NeedCrossDep = false; + for (auto SU : I->second) { + MachineInstr *MI = SU->getInstr(); + if (MI != SrcInst) + continue; + if (DstInst->mayLoad() && MI->mayLoad()) + break; + + // First, perform the cheaper check that compares the base reg. + MachineOperand *BaseReg1, *BaseReg2; + int64_t Offset1, Offset2; + bool Legal1 = + TII->getMemOperandWithOffset(*MI, BaseReg1, Offset1, TRI); + bool Legal2 = + TII->getMemOperandWithOffset(*DstInst, BaseReg2, Offset2, TRI); + if (Legal1 && Legal2 && BaseReg1->isIdenticalTo(*BaseReg2)) { + unsigned Delta1, Delta2; + unsigned MaxLoopCarry = (SWPMaxStages + 1); + if (computeDelta(*MI, Delta1) && computeDelta(*DstInst, Delta2)) { + uint64_t Width1 = (*MI->memoperands_begin())->getSize(); + uint64_t Width2 = (*DstInst->memoperands_begin())->getSize(); + // This is the main test, which checks the offset values and the + // loop increment value to determine if the accesses may be loop + // carried. + if ((Delta1 > 0 && + std::abs(Offset2 - Offset1) >= MaxLoopCarry * Delta1) || + ((Offset1 >= Offset2) && (Offset1 + Width1 <= Delta1)) || + ((Offset1 < Offset2) && (Offset2 + Width2 <= Delta2))) + break; + } + } + + // Second, the more expensive check that uses alias analysis on the + // value with MIMO info. + if (!AA) { + NeedCrossDep = true; + break; + } + + MachineMemOperand *MMO1 = *MI->memoperands_begin(); + MachineMemOperand *MMO2 = *DstInst->memoperands_begin(); + if (!MMO1->getValue() || !MMO2->getValue()) { + NeedCrossDep = true; + break; + } + AliasResult AAResult = AA->alias( + MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, + MMO1->getAAInfo()), + MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, + MMO2->getAAInfo())); + if (AAResult != NoAlias) + NeedCrossDep = true; + break; + } + + if (NeedCrossDep) + createLoopMemDep(SUI, SUJ, DstInst->mayLoad() ? 0 : 1); + } + } + } + } +} + +void ModuloSchedulerDAG::createLoopMemDep(SUnit *Src, SUnit *Dst, int Latency) { + if (Src == Dst) + return; + SDep Dep(Src, SDep::MayAliasMem); + Dep.setLatency(Latency); + + DdgUnit *DUse = DdgUnitMap[Dst]; + DdgUnit *DDef = DdgUnitMap[Src]; + if (DUse->addPred(Dep, DDef)) + NumBackEdges++; +} + +void ModuloSchedulerDAG::addLoopCarriedRegDeps() { + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + BitVector PRegDefs(TRI->getNumRegs()); + BitVector VRegDefs(MRI.getNumVirtRegs()); + + for (auto &N : SUnits) { + MachineInstr *MI = N.getInstr(); + + for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (PRegDefs[Reg]) + continue; + addLoopCarriedRegDep(MRI, Reg); + PRegDefs.set(Reg); + } else if (TargetRegisterInfo::isVirtualRegister(Reg)) { + unsigned u = TargetRegisterInfo::virtReg2Index(Reg); + if (VRegDefs[u]) + continue; + addLoopCarriedRegDep(MRI, Reg); + VRegDefs.set(u); + } + } + } +} + +int ModuloSchedulerDAG::getLastDefOperand(MachineRegisterInfo &MRI, + unsigned Reg, unsigned &Min, + unsigned &Max) { + int LastOpIdx = -1; + for (MachineOperand &MO : MRI.def_operands(Reg)) { + MachineInstr *MI = MO.getParent(); + SUnit *SU = MISUnitMap[MI]; + if (!SU) + continue; + + if (SU->NodeNum <= Min) + Min = SU->NodeNum; + if (SU->NodeNum >= Max) { + Max = SU->NodeNum; + LastOpIdx = &MO - &MI->getOperand(0); + } + } + + return LastOpIdx; +} + +/// Add loop carried dependence for registers, include physical or virtual +/// registers, with three kinds of deps "True/Anti/Output". +/// The following is the detailed description of the three kinds of loop carried +/// registers dependences: +/// 1) --> true deps +/// BB: +/// ... +/// yyy = Reg (a) <-- +/// ... | +/// Reg = xxx | +/// ... | (True Dep) +/// Reg = zzz (b) --- +/// ... +/// Branch BB +/// 2) --> anti deps +/// BB: +/// ... +/// Reg = xxx (a) <-- +/// ... | +/// Reg = zzz | +/// ... | (Anti Dep) +/// yyy = Reg (b) --- +/// ... +/// Branch BB +/// 3) --> output deps +/// BB: +/// ... +/// Reg = xxx (a) <-- +/// ... | +/// ... | (Output Dep) +/// Reg = zzz (b) --- +/// ... +/// Branch BB +void ModuloSchedulerDAG::addLoopCarriedRegDep(MachineRegisterInfo &MRI, + unsigned Reg) { + unsigned Min = SUnits.size(); + unsigned Max = 0; + int LastOpIdx = -1; + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) { + LastOpIdx = getLastDefOperand(MRI, *Alias, Min, Max); + } + } else + LastOpIdx = getLastDefOperand(MRI, Reg, Min, Max); + + assert(Min <= Max && "Can not find Min/Max DDG Node for inter-loop."); + + MachineInstr *LastDefMI = SUnits[Max].getInstr(); + MachineInstr *FirstDefMI = SUnits[Min].getInstr(); + + bool UseByLastDef = false; + for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + MachineInstr *UseMI = MO.getParent(); + SUnit *SU = MISUnitMap[UseMI]; + if (!SU) + continue; + + if (SU->NodeNum <= Min) { + // First, add true register dependence for last def to the use in the next + // iteration. + unsigned OpNo = &MO - &UseMI->getOperand(0); + SDep Dep = SDep(&SUnits[Max], SDep::Data, Reg); + int Latency = + SchedModel.computeOperandLatency(LastDefMI, LastOpIdx, UseMI, OpNo); + Dep.setLatency(Latency); + createLoopRegDep(SU, Dep); + UseByLastDef = true; + } else if (SU->NodeNum > Max) { + // Second, add anti register dependence for the use after last def to the + // first register def in the next iteration. + SDep Dep = SDep(SU, SDep::Anti, Reg); + createLoopRegDep(&SUnits[Min], Dep); + UseByLastDef = true; + } + } + + // Third, add an output register dependence between last def and the first def + // in the next iteration. + // There are two cases need be avoided, we need check: + // a) Whether a self output dependence(Min == Max). + // b) Is there a register use between last def and first def in the next + // iteration. + if (!UseByLastDef && Min != Max) { + SDep Dep = SDep(&SUnits[Max], SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(LastDefMI, LastOpIdx, FirstDefMI)); + createLoopRegDep(&SUnits[Min], Dep); + } +} + +void ModuloSchedulerDAG::createLoopRegDep(SUnit *Use, SDep &Dep) { + DdgUnit *DUse = DdgUnitMap[Use]; + DdgUnit *DDef = DdgUnitMap[Dep.getSUnit()]; + if (DUse->addPred(Dep, DDef)) + NumBackEdges++; +} + +/// newDdgUnit - Creates a new DdgUnit and return a ptr to it. +ModuloSchedulerDAG::DdgUnit *ModuloSchedulerDAG::newDdgUnit(SUnit *SU) { +#ifndef NDEBUG + const DdgUnit *Addr = DdgUnits.empty() ? nullptr : &DdgUnits[0]; +#endif + DdgUnits.emplace_back(this, SU); + assert((Addr == nullptr || Addr == &DdgUnits[0]) && + "DdgUnits std::vector reallocated on the fly!"); + return &DdgUnits.back(); +} + +/// addPred - This adds the specified edge as a pred of the current node if +/// not already. It also adds the current node as a successor of the +/// specified node. +bool ModuloSchedulerDAG::DdgUnit::addPred(const SDep &D, DdgUnit *DDef) { + // If this node already has this dependence, don't add a redundant one. + for (SmallVectorImpl::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + if (I->overlaps(D)) { + // Extend the latency if needed. Equivalent to removePred(I) + addPred(D). + if (I->getLatency() < D.getLatency()) { + // Find the corresponding successor in N. + SDep ForwardD = *I; + ForwardD.setSUnit(this->SU); + for (SmallVectorImpl::iterator II = DDef->Succs.begin(), + EE = DDef->Succs.end(); + II != EE; ++II) { + if (*II == ForwardD) { + II->setLatency(D.getLatency()); + break; + } + } + I->setLatency(D.getLatency()); + } + return false; + } + } + // Now add a corresponding succ to N. + SDep P = D; + P.setSUnit(this->SU); + + Preds.push_back(D); + DDef->Succs.push_back(P); + + return true; +} + +ModuloSchedulerDAG::DdgUnit *ModuloSchedulerDAG::getDdgUnit(SUnit *SU) { + if (DdgUnitMap.find(SU) == DdgUnitMap.end()) + return NULL; + + return DdgUnitMap[SU]; +} + +void ModuloSchedulerDAG::buildSccs() { + unsigned NumNodes = SUnits.size(); + for (unsigned i = 0; i < NumBackEdges; i++) { + SDep *Dep = BackEdges[i].first; + DdgUnit *Src = BackEdges[i].second; + DdgUnit *Dest = getDdgUnit(Dep->getSUnit()); + + if (InSccs.find(Dep) != InSccs.end()) + continue; + + BitVector Start(NumNodes); + BitVector End(NumNodes); + + // Kosaraju's algorithm to find the SCC graph. + findNodesByOrder(Dest->NodeNum, Start, true); + findNodesByOrder(Src->NodeNum, End, false); + Start &= End; + + if (Start.any()) { + Scc SCC(this, NumNodes); + SCC.creatScc(Start); + Sccs.push_back(SCC); + NumSccs++; + } + } + + if (NumSccs == 0) + return; + +#ifndef NDEBUG + // Check that every node in SCCS belongs to exactly one strongly connected + // component and that no element of SCCS is empty. + BitVector Tmp(NumNodes); + for (int i = 0; i < NumSccs; i++) { + assert(Sccs[i].Nodes.any() && "Scc can not be empty."); + // Verify that every node in sccs is + // in exactly one strongly connected component. + assert(!Tmp.anyCommon(Sccs[i].Nodes) && "Sccs can not have common nodes."); + Tmp |= Sccs[i].Nodes; + } + LLVM_DEBUG(dbgs() << "Node numbers for all sccs is " << Tmp.count() + << ", Total Node numbers is " << NumNodes << ".\n"); +#endif +} + +void ModuloSchedulerDAG::findNodesByOrder(unsigned NodeNum, BitVector &Result, + bool Succs) { + unsigned NumNodes = SUnits.size(); + BitVector Worklist(NumNodes); + BitVector Visited(NumNodes); + + Result.set(NodeNum); + Visited.set(NodeNum); + + bool change = true; + while (change) { + change = false; + Worklist.reset(); + Worklist |= Visited; + Visited.reset(); + + for (int u = Worklist.find_first(); u >= 0; u = Worklist.find_next(u)) { + DdgUnit &DU = DdgUnits[u]; + for (unsigned i = 0, e = Succs ? DU.Succs.size() : DU.Preds.size(); + i != e; ++i) { + SDep *Dep = Succs ? &DU.Succs[i] : &DU.Preds[i]; + DdgUnit *DestDU = getDdgUnit(Dep->getSUnit()); + assert(DestDU != NULL && "Can't find pred/succ DDG node."); + int V = DestDU->NodeNum; + + if (!Result[V]) { + Result.set(V); + Visited.set(V); + change = true; + } + } + } + } +} + +void ModuloSchedulerDAG::Scc::creatScc(BitVector &SccNodes) { + Nodes.clear(); + Nodes.resize(NumNodes); + + Nodes |= SccNodes; + + for (int u = SccNodes.find_first(); u >= 0; u = SccNodes.find_next(u)) { + DdgUnit &DU = DAG->DdgUnits[u]; + + for (unsigned i = 0, e = DU.Succs.size(); i != e; ++i) { + SDep *Dep = &DU.Succs[i]; + DdgUnit *DestDU = DAG->getDdgUnit(Dep->getSUnit()); + + if (SccNodes[DestDU->NodeNum]) { + DAG->InSccs.insert(Dep); + if (DAG->isBackEdge(&DU, DestDU)) { + BackEdges.push_back(std::make_pair(Dep, &DU)); + NumBackEdges++; + } + } + } + } +} + +//===----------------------------------------------------------------------===// +// GraphWriter support for Ddg. +//===----------------------------------------------------------------------===// + +#ifndef NDEBUG +namespace llvm { +class DdgNodeIterator + : public std::iterator { + ModuloSchedulerDAG::DdgUnit *Node; + unsigned Operand; + + DdgNodeIterator(ModuloSchedulerDAG::DdgUnit *N, unsigned Op) + : Node(N), Operand(Op) {} + +public: + bool operator==(const DdgNodeIterator &x) const { + return Operand == x.Operand; + } + bool operator!=(const DdgNodeIterator &x) const { return !operator==(x); } + + pointer operator*() const { + SUnit *SU = getDdgEdge()->getSUnit(); + return Node->DAG->getDdgUnit(SU); + } + pointer operator->() const { return operator*(); } + + // Preincrement + DdgNodeIterator &operator++() { + ++Operand; + return *this; + } + // Postincrement + DdgNodeIterator operator++(int) { + DdgNodeIterator tmp = *this; + ++*this; + return tmp; + } + + static DdgNodeIterator begin(ModuloSchedulerDAG::DdgUnit *N) { + return DdgNodeIterator(N, 0); + } + static DdgNodeIterator end(ModuloSchedulerDAG::DdgUnit *N) { + return DdgNodeIterator(N, N->Succs.size()); + } + + unsigned getOperand() const { return Operand; } + const ModuloSchedulerDAG::DdgUnit *getNode() const { return Node; } + + bool isMemDep() const { return getDdgEdge()->isNormalMemoryOrBarrier(); } + SDep::Kind getDepType() const { return getDdgEdge()->getKind(); } + const SDep *getDdgEdge() const { return &Node->Succs[Operand]; } +}; + +template <> struct GraphTraits { + typedef ModuloSchedulerDAG::DdgUnit *NodeRef; + typedef DdgNodeIterator ChildIteratorType; + + static NodeRef getEntryNode(NodeRef A) { return A; } + static ChildIteratorType child_begin(NodeRef N) { + return DdgNodeIterator::begin(N); + } + static ChildIteratorType child_end(NodeRef N) { + return DdgNodeIterator::end(N); + } +}; + +template <> +struct GraphTraits + : public GraphTraits { + typedef pointer_iterator::iterator> + nodes_iterator; + static nodes_iterator nodes_begin(ModuloSchedulerDAG *G) { + return nodes_iterator(G->DdgUnits.begin()); + } + static nodes_iterator nodes_end(ModuloSchedulerDAG *G) { + return nodes_iterator(G->DdgUnits.end()); + } +}; + +template <> +struct DOTGraphTraits : public DefaultDOTGraphTraits { + + DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(const ModuloSchedulerDAG *G) { + return G->MF.getName(); + } + + /// If you want to override the dot attributes printed for a particular + /// edge, override this method. + static std::string getEdgeAttributes(const ModuloSchedulerDAG::DdgUnit *Node, + DdgNodeIterator EI, + const ModuloSchedulerDAG *Graph) { + std::string Str; + raw_string_ostream OS(Str); + + SUnit *SU = EI.getDdgEdge()->getSUnit(); + if (SU->NodeNum <= Node->SU->NodeNum) + OS << "style=dashed, "; + + if (EI.getDepType() == SDep::Data) + OS << "color = black, label = \"true " << EI.getDdgEdge()->getLatency(); + else if (EI.getDepType() == SDep::Anti) + OS << "color = red, label = \"anti " << EI.getDdgEdge()->getLatency(); + else if (EI.getDepType() == SDep::Output) + OS << "color = blue, label = \"output " << EI.getDdgEdge()->getLatency(); + else { + OS << "color = cyan, label = \"mem " << EI.getDdgEdge()->getLatency(); + } + + if (EI.isMemDep()) + OS << " M\""; + else + OS << " R\""; + return OS.str(); + } + + static std::string getNodeLabel(const ModuloSchedulerDAG::DdgUnit *Node, + const ModuloSchedulerDAG *G) { + std::string Str; + raw_string_ostream OS(Str); + MachineBasicBlock *BB = Node->SU->getInstr()->getParent(); + + if (Node->SU != nullptr) + OS << "N:" << Node->SU->NodeNum; + else + OS << "BB#" << BB->getNumber(); + + return OS.str(); + } + static std::string getNodeDescription(const ModuloSchedulerDAG::DdgUnit *N, + const ModuloSchedulerDAG *G) { + std::string Str; + raw_string_ostream OS(Str); + + if (N->SU != nullptr) + N->SU->getInstr()->print(OS, /*SkipOpers=*/true); + else + OS << "Total insn: " << G->DdgUnits.size(); + + return OS.str(); + } + + static std::string getNodeAttributes(const ModuloSchedulerDAG::DdgUnit *N, + const ModuloSchedulerDAG *G) { + if (N->SU != nullptr) + return "shape=Mrecord"; + else + return "style = filled"; + } +}; +} // namespace llvm +#endif // NDEBUG + +/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG +/// rendered using 'dot'. +/// +void ModuloSchedulerDAG::viewGraph(const Twine &Name, const Twine &Title) { +#ifndef NDEBUG + ViewGraph(this, Name, false, Title); +#else + errs() << "Ddg::viewGraph is only available in debug builds on " + << "systems with Graphviz or gv!\n"; +#endif // NDEBUG +} + +/// Out-of-line implementation with no arguments is handy for gdb. +void ModuloSchedulerDAG::viewGraph() { + viewGraph(BB->getFullName(), "DDG Graph for " + BB->getName()); +}