Index: include/llvm/CodeGen/PseudoSourceValue.h
===================================================================
--- include/llvm/CodeGen/PseudoSourceValue.h
+++ include/llvm/CodeGen/PseudoSourceValue.h
@@ -22,6 +22,8 @@
 class raw_ostream;

 raw_ostream &operator<<(raw_ostream &OS, const MachineMemOperand &MMO);
+class PseudoSourceValue;
+raw_ostream &operator<<(raw_ostream &OS, const PseudoSourceValue* PSV);

 /// PseudoSourceValue - Special value supplied for machine level alias
 /// analysis. It indicates that a memory access references the functions
@@ -31,6 +33,8 @@
 private:
   friend raw_ostream &llvm::operator<<(raw_ostream &OS,
                                        const MachineMemOperand &MMO);
+  friend raw_ostream &llvm::operator<<(raw_ostream &OS,
+                                       const PseudoSourceValue* PSV);

   /// printCustom - Implement printing for PseudoSourceValue. This is called
   /// from Value::print or Value's operator<<.

Index: include/llvm/CodeGen/ScheduleDAG.h
===================================================================
--- include/llvm/CodeGen/ScheduleDAG.h
+++ include/llvm/CodeGen/ScheduleDAG.h
@@ -413,6 +413,17 @@
     /// specified node.
     bool addPred(const SDep &D, bool Required = true);

+    /// addPredBarrier - This adds a barrier edge to SU by calling
+    /// addPred(), with latency 0 generally or latency 1 for a store
+    /// followed by a load.
+    bool addPredBarrier(SUnit *SU) {
+      SDep Dep(SU, SDep::Barrier);
+      unsigned TrueMemOrderLatency =
+        ((SU->getInstr()->mayStore() && this->getInstr()->mayLoad()) ? 1 : 0);
+      Dep.setLatency(TrueMemOrderLatency);
+      return addPred(Dep);
+    }
+
     /// removePred - This removes the specified edge as a pred of the current
     /// node if it exists. It also removes the current node as a successor of
     /// the specified node.
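As a minimal standalone model of the addPredBarrier() latency rule above (illustrative only -- Inst and barrierLatency are stand-ins, not LLVM types): only a store followed by a dependent load gets the estimated one-cycle bypass latency, every other barrier edge has latency 0.

    #include <cassert>

    struct Inst {
      bool Store, Load;                        // stand-in for MachineInstr
      bool mayStore() const { return Store; }
      bool mayLoad() const { return Load; }
    };

    // A barrier edge normally has latency 0; a store followed by a load
    // gets latency 1, estimating a one-cycle store-to-load bypass.
    unsigned barrierLatency(const Inst &Pred, const Inst &Succ) {
      return (Pred.mayStore() && Succ.mayLoad()) ? 1 : 0;
    }

    int main() {
      assert(barrierLatency({true, false}, {false, true}) == 1); // store -> load
      assert(barrierLatency({false, true}, {true, false}) == 0); // load -> store
    }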
Index: include/llvm/CodeGen/ScheduleDAGInstrs.h
===================================================================
--- include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -15,12 +15,14 @@
 #ifndef LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
 #define LLVM_CODEGEN_SCHEDULEDAGINSTRS_H

+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SparseMultiSet.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <list>

 namespace llvm {
   class MachineFrameInfo;
@@ -71,6 +73,10 @@
   /// vreg use list.
   typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2UseMap;

+  typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+  typedef SmallVector<PointerIntPair<ValueType, 1, bool>, 4>
+  UnderlyingObjectsVector;
+
   /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
   /// MachineInstrs.
   class ScheduleDAGInstrs : public ScheduleDAG {
@@ -135,10 +141,67 @@
     /// Track the last instruction in this region defining each virtual register.
     VReg2SUnitMap VRegDefs;

-    /// PendingLoads - Remember where unknown loads are after the most recent
-    /// unknown store, as we iterate. As with Defs and Uses, this is here
-    /// to minimize construction/destruction.
-    std::vector<SUnit *> PendingLoads;
+    AliasAnalysis *AAForDep;
+
+    /// Remember where a generic side-effecting instruction is as we
+    /// proceed. No SU ever gets scheduled around the BarrierChain
+    /// (except when a huge scheduling region has been reduced by
+    /// inserting a barrier chain far below the current instruction).
+    SUnit *BarrierChain;
+
+  public:
+
+    /// A list of SUnits, used in Value2SUsMap, during DAG construction.
+    /// std::list seems faster than SmallVector, and SparseSet is not
+    /// applicable. TODO: What is actually needed is a singly-linked
+    /// list with a memory pool.
+    typedef std::list<SUnit *> SUList;
+  protected:
+    /// A map from ValueType to SUList, used during DAG construction, as
+    /// a means of remembering which SUs depend on which memory
+    /// locations.
+    class Value2SUsMap;
+
+    /// Remove in FIFO order some SUs from huge maps.
+    void reduceHugeMemNodeMaps(Value2SUsMap &stores,
+                               Value2SUsMap &loads, unsigned N);
+
+    /// Add a chain edge between SUa and SUb, but only if neither
+    /// AliasAnalysis nor the target can prove the accesses disjoint.
+    void addChainDependency(SUnit *SUa, SUnit *SUb,
+                            unsigned Latency = 0);
+
+    /// Add dependencies as needed from all SUs in list to SU.
+    void addChainDependencies(SUnit *SU, SUList &sus, unsigned Latency) {
+      for (auto *su : sus)
+        addChainDependency(SU, su, Latency);
+    }
+
+    /// Add dependencies as needed from all SUs in map, to SU.
+    void addChainDependencies(SUnit *SU, Value2SUsMap &Val2SUsMap);
+
+    /// Add dependencies as needed to SU, from all SUs mapped to V.
+    void addChainDependencies(SUnit *SU, Value2SUsMap &Val2SUsMap,
+                              ValueType V);
+
+    /// Add barrier chain edges from all SUs in map, and then clear
+    /// the map. This is equivalent to insertBarrierChain(), but
+    /// optimized for the common case where the new BarrierChain (a
+    /// global memory object) has a higher NodeNum than all SUs in
+    /// map. It is assumed BarrierChain has been set before calling
+    /// this.
+    void addBarrierChain(Value2SUsMap &map);
+
+    /// Insert a barrier chain in a huge region, far below the current
+    /// SU. Add barrier chain edges from all SUs in map with higher
+    /// NodeNums than this new BarrierChain, and remove them from
+    /// map. If BarrierChain is present in a list, it will also be
+    /// removed. It is assumed BarrierChain has been set before
+    /// calling this.
+    void insertBarrierChain(Value2SUsMap &map);
+
+    /// For an unanalyzable memory access, this Value is used in maps.
+    UndefValue *UnknownValue;

     /// DbgValues - Remember instruction that precedes DBG_VALUE.
     /// These are generated by buildSchedGraph but persist so they can be
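The UnderlyingObjectsVector entries above pack a may-alias flag into each value pointer via PointerIntPair. A minimal standalone model of that packing (PtrBool is a hypothetical stand-in; the real class is llvm::PointerIntPair, and this assumes at least 2-byte pointer alignment):

    #include <cassert>
    #include <cstdint>

    // Pack a bool into the low bit of an aligned pointer.
    struct PtrBool {
      std::uintptr_t Bits;
      PtrBool(const void *P, bool B)
          : Bits(reinterpret_cast<std::uintptr_t>(P) | (B ? 1u : 0u)) {
        assert((reinterpret_cast<std::uintptr_t>(P) & 1u) == 0 &&
               "pointer must be at least 2-byte aligned");
      }
      const void *getPointer() const {
        return reinterpret_cast<const void *>(Bits & ~std::uintptr_t(1));
      }
      bool getInt() const { return Bits & 1u; }
    };

    int main() {
      static int Obj;         // stand-in for an IR Value
      PtrBool P(&Obj, true);  // true: the object may alias others
      assert(P.getPointer() == &Obj && P.getInt());
    }

This is why the DAG builder below reads each underlying object with underlObj.getPointer() and underlObj.getInt() rather than from two separate fields.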
Index: lib/CodeGen/ScheduleDAGInstrs.cpp
===================================================================
--- lib/CodeGen/ScheduleDAGInstrs.cpp
+++ lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -27,6 +26,8 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -49,13 +50,39 @@
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
     cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));

+// When the Stores and Loads maps (or NonAliasStores and NonAliasLoads)
+// together hold this many SUs, a reduction of the maps will be done.
+static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
+    cl::init(1000), cl::desc("The limit to use while constructing the DAG "
+                             "prior to scheduling, at which point a trade-off "
+                             "is made to avoid excessive compile time."));
+
+static cl::opt<unsigned> ReductionSize("dag-maps-reduction-size", cl::Hidden,
+    cl::init(500), cl::desc("A huge scheduling region will have maps reduced "
+                            "by this many nodes at a time."));
+
+static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  dbgs() << "{ ";
+  for (auto *su : L) {
+    dbgs() << "SU(" << su->NodeNum << ")";
+    if (su != L.back())
+      dbgs() << ", ";
+  }
+  dbgs() << "}\n";
+#endif
+}
+
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo *mli,
                                      bool IsPostRAFlag, bool RemoveKillFlags,
                                      LiveIntervals *lis)
   : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
     IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
-    CanHandleTerminators(false), FirstDbgValue(nullptr) {
+    CanHandleTerminators(false), AAForDep(nullptr), BarrierChain(nullptr),
+    UnknownValue(UndefValue::get(
+      Type::getVoidTy(mf.getFunction()->getContext()))),
+    FirstDbgValue(nullptr) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
   DbgValues.clear();
   assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
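Both knobs are ordinary (hidden) cl::opt flags, so they can be tuned from the command line when chasing compile-time problems in huge regions; an illustrative invocation (the input file name is arbitrary, assuming a stock llc build):

    llc -dag-maps-huge-region=50 -dag-maps-reduction-size=20 foo.ll

If -dag-maps-reduction-size is left unset, buildSchedGraph() reduces by half of the region limit at a time (see the ReductionSize adjustment further below).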
@@ -124,10 +151,6 @@
   } while (!Working.empty());
 }

-typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
-typedef SmallVector<PointerIntPair<ValueType, 1, bool>, 4>
-UnderlyingObjectsVector;
-
 /// getUnderlyingObjectsForInstr - If this machine instr has memory reference
 /// information and it can be tracked to a normal reference to a known
 /// object, return the Value for that object.
@@ -493,41 +516,36 @@
     return true;
   }

-  const Value *V = (*MI->memoperands_begin())->getValue();
-  if (!V)
+  if ((*MI->memoperands_begin())->getValue() == nullptr)
     return true;

-  SmallVector<Value *, 4> Objs;
-  getUnderlyingObjects(V, Objs, DL);
-  for (Value *V : Objs) {
-    // Does this pointer refer to a distinct and identifiable object?
-    if (!isIdentifiedObject(V))
-      return true;
-  }
-
   return false;
 }

 /// This returns true if the two MIs need a chain edge between them.
-/// If these are not even memory operations, we still may need
-/// chain deps between them. The question really is - could
-/// these two MIs be reordered during scheduling from memory dependency
-/// point of view.
+/// This is called on normal stores and loads.
 static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                              const DataLayout &DL, MachineInstr *MIa,
                              MachineInstr *MIb) {
   const MachineFunction *MF = MIa->getParent()->getParent();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

-  // Cover a trivial case - no edge is need to itself.
-  if (MIa == MIb)
-    return false;
-
+  assert ((MIa->mayStore() || MIb->mayStore()) &&
+          "Dependency checked between two loads");
+
+  // buildSchedGraph() will clear the list of stores if not using AA,
+  // which means all stores have to be chained without AA.
+  if (!AA && MIa->mayStore() && MIb->mayStore())
+    return true;
+
   // Let the target decide if memory accesses cannot possibly overlap.
-  if ((MIa->mayLoad() || MIa->mayStore()) &&
-      (MIb->mayLoad() || MIb->mayStore()))
-    if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
-      return false;
+  if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
+    return false;
+
+  if (!AA)
+    return true;
+
+  // To this point the analysis is generic. From here on we do need AA.

   // FIXME: Need to handle multiple memory operands to support all targets.
   if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
@@ -536,15 +554,6 @@
     return true;

   if (isUnsafeMemoryObject(MIa, MFI, DL) || isUnsafeMemoryObject(MIb, MFI, DL))
     return true;

-  // If we are dealing with two "normal" loads, we do not need an edge
-  // between them - they could be reordered.
-  if (!MIa->mayStore() && !MIb->mayStore())
-    return false;
-
-  // To this point analysis is generic. From here on we do need AA.
-  if (!AA)
-    return true;
-
   MachineMemOperand *MMOa = *MIa->memoperands_begin();
   MachineMemOperand *MMOb = *MIb->memoperands_begin();
@@ -583,106 +592,17 @@
   return (AAResult != AliasAnalysis::NoAlias);
 }

-/// This recursive function iterates over chain deps of SUb looking for
-/// "latest" node that needs a chain edge to SUa.
-static unsigned iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
-                                 const DataLayout &DL, SUnit *SUa, SUnit *SUb,
-                                 SUnit *ExitSU, unsigned *Depth,
-                                 SmallPtrSetImpl<const SUnit*> &Visited) {
-  if (!SUa || !SUb || SUb == ExitSU)
-    return *Depth;
-
-  // Remember visited nodes.
-  if (!Visited.insert(SUb).second)
-    return *Depth;
-  // If there is _some_ dependency already in place, do not
-  // descend any further.
-  // TODO: Need to make sure that if that dependency got eliminated or ignored
-  // for any reason in the future, we would not violate DAG topology.
-  // Currently it does not happen, but makes an implicit assumption about
-  // future implementation.
-  //
-  // Independently, if we encounter node that is some sort of global
-  // object (like a call) we already have full set of dependencies to it
-  // and we can stop descending.
-  if (SUa->isSucc(SUb) ||
-      isGlobalMemoryObject(AA, SUb->getInstr()))
-    return *Depth;
-
-  // If we do need an edge, or we have exceeded depth budget,
-  // add that edge to the predecessors chain of SUb,
-  // and stop descending.
-  if (*Depth > 200 ||
-      MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
-    SUb->addPred(SDep(SUa, SDep::MayAliasMem));
-    return *Depth;
-  }
-  // Track current depth.
-  (*Depth)++;
-  // Iterate over memory dependencies only.
-  for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
-       I != E; ++I)
-    if (I->isNormalMemoryOrBarrier())
-      iterateChainSucc(AA, MFI, DL, SUa, I->getSUnit(), ExitSU, Depth, Visited);
-  return *Depth;
-}
-
-/// This function assumes that "downward" from SU there exist
-/// tail/leaf of already constructed DAG. It iterates downward and
-/// checks whether SU can be aliasing any node dominated
-/// by it.
-static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
-                            const DataLayout &DL, SUnit *SU, SUnit *ExitSU,
-                            std::set<SUnit *> &CheckList,
-                            unsigned LatencyToLoad) {
-  if (!SU)
-    return;
-
-  SmallPtrSet<const SUnit*, 16> Visited;
-  unsigned Depth = 0;
-
-  for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
-       I != IE; ++I) {
-    if (SU == *I)
-      continue;
-    if (MIsNeedChainEdge(AA, MFI, DL, SU->getInstr(), (*I)->getInstr())) {
-      SDep Dep(SU, SDep::MayAliasMem);
-      Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
-      (*I)->addPred(Dep);
-    }
-
-    // Iterate recursively over all previously added memory chain
-    // successors. Keep track of visited nodes.
-    for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
-           JE = (*I)->Succs.end(); J != JE; ++J)
-      if (J->isNormalMemoryOrBarrier())
-        iterateChainSucc(AA, MFI, DL, SU, J->getSUnit(), ExitSU, &Depth,
-                         Visited);
-  }
-}
-
-/// Check whether two objects need a chain edge, if so, add it
-/// otherwise remember the rejected SU.
-static inline void addChainDependency(AliasAnalysis *AA,
-                                      const MachineFrameInfo *MFI,
-                                      const DataLayout &DL, SUnit *SUa,
-                                      SUnit *SUb, std::set<SUnit *> &RejectList,
-                                      unsigned TrueMemOrderLatency = 0,
-                                      bool isNormalMemory = false) {
-  // If this is a false dependency,
-  // do not add the edge, but rememeber the rejected node.
-  if (MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) {
-    SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
-    Dep.setLatency(TrueMemOrderLatency);
+/// Check whether two objects need a chain edge; if so, add it.
+void ScheduleDAGInstrs::addChainDependency(SUnit *SUa, SUnit *SUb,
+                                           unsigned Latency) {
+  // Add the edge only if neither AA nor the target can prove that the
+  // accesses are disjoint.
+  if (MIsNeedChainEdge(AAForDep, MFI, *TM.getDataLayout(), SUa->getInstr(),
+                       SUb->getInstr())) {
+    SDep Dep(SUa, SDep::MayAliasMem);
+    Dep.setLatency(Latency);
     SUb->addPred(Dep);
   }
-  else {
-    // Duplicate entries should be ignored.
-    RejectList.insert(SUb);
-    DEBUG(dbgs() << "\tReject chain dep between SU("
-          << SUa->NodeNum << ") and SU("
-          << SUb->NodeNum << ")\n");
-  }
 }

 /// Create an SUnit for each real instruction, numbered in top-down topological
@@ -743,6 +663,122 @@
   }
 }

+class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
+
+  /// Current total number of SUs in map.
+  unsigned NumNodes;
+
+  /// 1 for loads, 0 for stores. (see comment in SUList)
+  unsigned TrueMemOrderLatency;
+public:
+
+  Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+
+  /// To keep NumNodes up to date, insert() is used instead of
+  /// this operator w/ push_back().
+  ValueType &operator[](const SUList &Key) {
+    llvm_unreachable("Don't use. Use insert() instead."); };
+
+  /// Add SU to the SUList of V. When the map grows huge, the caller
+  /// reduces it with reduceHugeMemNodeMaps().
+  void inline insert(SUnit *SU, ValueType V) {
+    MapVector::operator[](V).push_back(SU);
+    NumNodes++;
+  }
+
+  /// Clears the list of SUs mapped to V.
+  void inline clearList(ValueType V) {
+    iterator Itr = find(V);
+    if (Itr != end()) {
+      assert (NumNodes >= Itr->second.size());
+      NumNodes -= Itr->second.size();
+
+      Itr->second.clear();
+    }
+  }
+
+  /// Clears map from all contents.
+  void clear() {
+    MapVector::clear();
+    NumNodes = 0;
+  }
+
+  unsigned inline size() const { return NumNodes; }
+
+  /// Count the number of SUs in this map after a reduction.
+  void reComputeSize(void) {
+    NumNodes = 0;
+    for (auto &I : *this)
+      NumNodes += I.second.size();
+  }
+
+  unsigned inline getTrueMemOrderLatency() const {
+    return TrueMemOrderLatency;
+  }
+
+  void dump();
+};
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+                                             Value2SUsMap &Val2SUsMap) {
+  for (auto &I : Val2SUsMap)
+    addChainDependencies(SU, I.second,
+                         Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+                                             Value2SUsMap &Val2SUsMap,
+                                             ValueType V) {
+  Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
+  if (Itr != Val2SUsMap.end())
+    addChainDependencies(SU, Itr->second,
+                         Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
+  assert (BarrierChain != nullptr);
+
+  for (auto &I : map) {
+    SUList &sus = I.second;
+    for (auto *SU : sus)
+      SU->addPredBarrier(BarrierChain);
+  }
+  map.clear();
+}
+
+void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
+  assert (BarrierChain != nullptr);
+
+  // Go through all lists of SUs.
+  for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
+    Value2SUsMap::iterator CurrItr = I++;
+    SUList &sus = CurrItr->second;
+    SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
+    for (; SUItr != SUEE; ++SUItr) {
+      // Stop on BarrierChain or any instruction above it.
+      if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
+        break;
+
+      (*SUItr)->addPredBarrier(BarrierChain);
+    }
+
+    // Also remove the BarrierChain from the list if present.
+    if (SUItr != SUEE && *SUItr == BarrierChain)
+      SUItr++;
+
+    // Remove all SUs that are now successors of BarrierChain.
+    if (SUItr != sus.begin())
+      sus.erase(sus.begin(), SUItr);
+  }
+
+  // Remove all entries with empty su lists.
+  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
+    return (mapEntry.second.empty()); });
+
+  // Recompute the size of the map (NumNodes).
+  map.reComputeSize();
+}
+
 /// If RegPressure is non-null, compute register pressure as a side effect. The
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
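A standalone sketch of the bookkeeping Value2SUsMap layers on top of MapVector (illustrative only; Value2SUs, plain std::map, and unsigned SU ids are stand-ins): the total number of mapped SUs is cached on every insert so that the huge-region check in buildSchedGraph() stays O(1) instead of walking every list.

    #include <cassert>
    #include <list>
    #include <map>

    struct Value2SUs {
      std::map<const void *, std::list<unsigned>> Map; // key -> dependent SUs
      unsigned NumNodes = 0;                           // cached total list size

      void insert(unsigned SU, const void *V) {
        Map[V].push_back(SU);
        NumNodes++;
      }
      // E.g. keep only one store per object when not using AA.
      void clearList(const void *V) {
        auto Itr = Map.find(V);
        if (Itr == Map.end())
          return;
        NumNodes -= Itr->second.size();
        Itr->second.clear();
      }
      unsigned size() const { return NumNodes; }
    };

    int main() {
      static int A, B;
      Value2SUs Stores;
      Stores.insert(0, &A);
      Stores.insert(1, &A);
      Stores.insert(2, &B);
      assert(Stores.size() == 3);
      Stores.clearList(&A);
      assert(Stores.size() == 1);
    }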
@@ -752,7 +788,9 @@
   const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
-  AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
+  AAForDep = UseAA ? AA : nullptr;
+
+  BarrierChain = nullptr;

   MISUnitMap.clear();
   ScheduleDAG::clearDAG();
@@ -763,19 +801,30 @@
   if (PDiffs)
     PDiffs->init(SUnits.size());

-  // We build scheduling units by walking a block's instruction list from bottom
-  // to top.
-
-  // Remember where a generic side-effecting instruction is as we procede.
-  SUnit *BarrierChain = nullptr, *AliasChain = nullptr;
-
-  // Memory references to specific known memory locations are tracked
-  // so that they can be given more precise dependencies. We track
-  // separately the known memory locations that may alias and those
-  // that are known not to alias
-  MapVector<ValueType, std::vector<SUnit *> > AliasMemDefs, NonAliasMemDefs;
-  MapVector<ValueType, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
-  std::set<SUnit *> RejectMemNodes;
+  // We build scheduling units by walking a block's instruction list
+  // from bottom to top.
+
+  // Each MI's memory operand(s) is analyzed to a list of underlying
+  // objects. The SU is then inserted in the SUList(s) mapped from the
+  // Value(s). Each Value thus gets mapped to lists of SUs depending
+  // on it, defs and uses kept separately. Two SUs are non-aliasing if
+  // they depend exclusively on different Values.
+  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+
+  // Certain memory accesses are known to not alias any SU in Stores
+  // or Loads, and have therefore their own 'NonAlias' domain. E.g.
+  // spill / reload instructions never alias LLVM IR Values. It is
+  // assumed that this type of memory accesses always have a proper
+  // memory operand modelling, and are therefore never unanalyzable.
+  // This means they are non-aliasing against all nodes in Stores and
+  // Loads, including the unanalyzable ones.
+  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
+
+  // Always reduce a huge region by half of the elements, except when
+  // the user sets this number explicitly.
+  if (ReductionSize.getNumOccurrences() == 0)
+    ReductionSize = (HugeRegion / 2);

   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
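To see why the per-Value maps are cheaper than chaining through every memory SU, here is an illustrative bottom-up simulation (not LLVM code; the UnknownValue entry and the NonAlias maps are omitted, and edges are only printed): a store visits only the SUs already mapped to its own underlying object.

    #include <cstdio>
    #include <list>
    #include <map>
    #include <string>

    int main() {
      // Program order: SU(0) store A, SU(1) store B, SU(2) load A.
      struct Access { unsigned SU; bool IsStore; std::string Obj; };
      Access Region[] = {{0, true, "A"}, {1, true, "B"}, {2, false, "A"}};

      std::map<std::string, std::list<unsigned>> Stores, Loads;

      // Visit bottom-up, as buildSchedGraph() does; SUs already in the
      // maps lie below the current instruction and must depend on it.
      for (int i = 2; i >= 0; --i) {
        Access &A = Region[i];
        if (A.IsStore) {
          for (unsigned SU : Stores[A.Obj])
            std::printf("SU(%u) depends on SU(%u)\n", SU, A.SU);
          for (unsigned SU : Loads[A.Obj]) // store -> load: latency 1
            std::printf("SU(%u) depends on SU(%u) (latency 1)\n", SU, A.SU);
          Stores[A.Obj].push_back(A.SU);
        } else {
          for (unsigned SU : Stores[A.Obj]) // loads depend only on stores
            std::printf("SU(%u) depends on SU(%u)\n", SU, A.SU);
          Loads[A.Obj].push_back(A.SU);
        }
      }
      // Prints only "SU(2) depends on SU(0) (latency 1)"; SU(1), which
      // touches B exclusively, needs no chain edge at all.
    }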
@@ -857,219 +906,214 @@
       ExitSU.addPred(Dep);
     }

-    // Add chain dependencies.
-    // Chain dependencies used to enforce memory order should have
-    // latency of 0 (except for true dependency of Store followed by
-    // aliased Load... we estimate that with a single cycle of latency
-    // assuming the hardware will bypass)
-    // Note that isStoreToStackSlot and isLoadFromStackSLot are not usable
+    // Add memory dependencies.
+    // Note: isStoreToStackSlot and isLoadFromStackSlot are not usable
     // after stack slots are lowered to actual addresses.
-    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
-    // produce more precise dependence information.
-    unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
     if (isGlobalMemoryObject(AA, MI)) {
-      // Be conservative with these and add dependencies on all memory
-      // references, even those that are known to not alias.
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
-          I->second[i]->addPred(SDep(SU, SDep::Barrier));
-        }
-      }
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
-          SDep Dep(SU, SDep::Barrier);
-          Dep.setLatency(TrueMemOrderLatency);
-          I->second[i]->addPred(Dep);
-        }
-      }
-      // Add SU to the barrier chain.
+      // This is a barrier event that acts as a pivotal node in the DAG.
+
+      // Become the barrier chain.
       if (BarrierChain)
-        BarrierChain->addPred(SDep(SU, SDep::Barrier));
+        BarrierChain->addPredBarrier(SU);
       BarrierChain = SU;
-      // This is a barrier event that acts as a pivotal node in the DAG,
-      // so it is safe to clear list of exposed nodes.
-      adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-      RejectMemNodes.clear();
-      NonAliasMemDefs.clear();
-      NonAliasMemUses.clear();
-
-      // fall-through
-    new_alias_chain:
-      // Chain all possibly aliasing memory references through SU.
-      if (AliasChain) {
-        unsigned ChainLatency = 0;
-        if (AliasChain->getInstr()->mayLoad())
-          ChainLatency = TrueMemOrderLatency;
-        addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain,
-                           RejectMemNodes, ChainLatency);
-      }
-      AliasChain = SU;
-      for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                           PendingLoads[k], RejectMemNodes,
-                           TrueMemOrderLatency);
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                             I->second[i], RejectMemNodes);
-      }
-      for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-             AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
-        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                             I->second[i], RejectMemNodes, TrueMemOrderLatency);
-      }
-      adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-      PendingLoads.clear();
-      AliasMemDefs.clear();
-      AliasMemUses.clear();
-    } else if (MI->mayStore()) {
-      // Add dependence on barrier chain, if needed.
-      // There is no point to check aliasing on barrier event. Even if
-      // SU and barrier _could_ be reordered, they should not. In addition,
-      // we have lost all RejectMemNodes below barrier.
-      if (BarrierChain)
-        BarrierChain->addPred(SDep(SU, SDep::Barrier));
-
-      UnderlyingObjectsVector Objs;
-      getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout());
+      DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
+
+      // Add dependencies against everything below it and clear maps.
+      addBarrierChain(Stores);
+      addBarrierChain(Loads);
+      addBarrierChain(NonAliasStores);
+      addBarrierChain(NonAliasLoads);
+
+      continue;
+    }
+
+    // If it's not a store or a variant load, we're done.
+    if (!MI->mayStore() && !(MI->mayLoad() && !MI->isInvariantLoad(AA)))
+      continue;
+
+    // Always add a dependency edge to BarrierChain if present.
+    if (BarrierChain)
+      BarrierChain->addPredBarrier(SU);
+
+    // Find the underlying objects for MI. The Objs vector is either
+    // empty, or filled with the Values of memory locations which this
+    // SU depends on. An empty vector means the memory location is
+    // unknown, and may alias anything except NonAlias nodes. Such
+    // nodes are mapped to 'UnknownValue'.
+    UnderlyingObjectsVector Objs;
+    getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout());
+
+    if (MI->mayStore()) {
       if (Objs.empty()) {
-        // Treat all other stores conservatively.
-        goto new_alias_chain;
+        // An unknown store depends on all stores and loads, except
+        // NonAliasStores and NonAliasLoads.
+        addChainDependencies(SU, Stores);
+        addChainDependencies(SU, Loads);
+
+        // If we're not using AA, clear the Stores map since all stores
+        // will be chained anyway.
+        if (!AAForDep)
+          Stores.clear();
+
+        // Map this store to 'UnknownValue'.
+        Stores.insert(SU, UnknownValue);
+        continue;
       }

+      // Add precise dependencies against all previously seen memory
+      // accesses mapped to the same Value(s).
       bool MayAlias = false;
-      for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end();
-           K != KE; ++K) {
-        ValueType V = K->getPointer();
-        bool ThisMayAlias = K->getInt();
+      for (auto &underlObj : Objs) {
+        ValueType V = underlObj.getPointer();
+        bool ThisMayAlias = underlObj.getInt();
         if (ThisMayAlias)
           MayAlias = true;

-        // A store to a specific PseudoSourceValue. Add precise dependencies.
-        // Record the def in MemDefs, first adding a dep if there is
-        // an existing def.
-        MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-          ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-        MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
-          ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
-        if (I != IE) {
-          for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-            addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                               I->second[i], RejectMemNodes, 0, true);
-
-          // If we're not using AA, then we only need one store per object.
-          if (!AAForDep)
-            I->second.clear();
-          I->second.push_back(SU);
-        } else {
-          if (ThisMayAlias) {
-            if (!AAForDep)
-              AliasMemDefs[V].clear();
-            AliasMemDefs[V].push_back(SU);
-          } else {
-            if (!AAForDep)
-              NonAliasMemDefs[V].clear();
-            NonAliasMemDefs[V].push_back(SU);
-          }
-        }
-        // Handle the uses in MemUses, if there are any.
-        MapVector<ValueType, std::vector<SUnit *> >::iterator J =
-          ((ThisMayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V));
-        MapVector<ValueType, std::vector<SUnit *> >::iterator JE =
-          ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
-        if (J != JE) {
-          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                               J->second[i], RejectMemNodes,
-                               TrueMemOrderLatency, true);
-          J->second.clear();
-        }
+        Value2SUsMap &stores_ = (ThisMayAlias ? Stores : NonAliasStores);
+
+        // Add dependencies to previous stores and loads mapped to V.
+        addChainDependencies(SU, stores_, V);
+        addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
+
+        // If we're not using AA, then we only need one store per object.
+        if (!AAForDep)
+          stores_.clearList(V);
+
+        // Map this store to V.
+        stores_.insert(SU, V);
       }

       if (MayAlias) {
-        // Add dependencies from all the PendingLoads, i.e. loads
-        // with no underlying object.
-        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-          addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                             PendingLoads[k], RejectMemNodes,
-                             TrueMemOrderLatency);
-        // Add dependence on alias chain, if needed.
-        if (AliasChain)
-          addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain,
-                             RejectMemNodes);
+        // The store is not 'NonAlias', and may therefore have
+        // dependencies to unanalyzable loads and stores.
+        addChainDependencies(SU, Loads, UnknownValue);
+        addChainDependencies(SU, Stores, UnknownValue);
       }
-      adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes,
-                      TrueMemOrderLatency);
-    } else if (MI->mayLoad()) {
-      bool MayAlias = true;
-      if (MI->isInvariantLoad(AA)) {
-        // Invariant load, no chain dependencies needed!
-      } else {
-        UnderlyingObjectsVector Objs;
-        getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout());
-
-        if (Objs.empty()) {
-          // A load with no underlying object. Depend on all
-          // potentially aliasing stores.
-          for (MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-                 AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-              addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                                 I->second[i], RejectMemNodes);
-
-          PendingLoads.push_back(SU);
-          MayAlias = true;
-        } else {
-          MayAlias = false;
-        }
-        for (UnderlyingObjectsVector::iterator
-               J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
-          ValueType V = J->getPointer();
-          bool ThisMayAlias = J->getInt();
-
-          if (ThisMayAlias)
-            MayAlias = true;
-
-          // A load from a specific PseudoSourceValue. Add precise dependencies.
-          MapVector<ValueType, std::vector<SUnit *> >::iterator I =
-            ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-          MapVector<ValueType, std::vector<SUnit *> >::iterator IE =
-            ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
-          if (I != IE)
-            for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-              addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU,
-                                 I->second[i], RejectMemNodes, 0, true);
-          if (ThisMayAlias)
-            AliasMemUses[V].push_back(SU);
-          else
-            NonAliasMemUses[V].push_back(SU);
-        }
-        if (MayAlias)
-          adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU,
-                          RejectMemNodes, /*Latency=*/0);
-        // Add dependencies on alias and barrier chains, if needed.
-        if (MayAlias && AliasChain)
-          addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain,
-                             RejectMemNodes);
-        if (BarrierChain)
-          BarrierChain->addPred(SDep(SU, SDep::Barrier));
-      }
+    } else { // SU is a load.
+      if (Objs.empty()) {
+        // An unknown load depends on all stores, except NonAliasStores.
+        addChainDependencies(SU, Stores);
+
+        // Map this load to 'UnknownValue'.
+        Loads.insert(SU, UnknownValue);
+        continue;
+      }
+
+      bool MayAlias = false;
+      for (auto &underlObj : Objs) {
+        ValueType V = underlObj.getPointer();
+        bool ThisMayAlias = underlObj.getInt();
+        if (ThisMayAlias)
+          MayAlias = true;
+
+        // Add precise dependencies against all previously seen stores
+        // mapping to the same Value(s).
+        addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+
+        // Map this load to V.
+        (ThisMayAlias ?
Loads : NonAliasLoads).insert(SU, V);
+      }
+
+      if (MayAlias)
+        // The load is not 'NonAlias', and may therefore have
+        // dependencies to unanalyzable stores.
+        addChainDependencies(SU, Stores, UnknownValue);
+    }
+
+    // Reduce the maps if they grow huge.
+    if (Stores.size() + Loads.size() >= HugeRegion) {
+      DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+      reduceHugeMemNodeMaps(Stores, Loads, ReductionSize);
+    }
+    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+      DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, ReductionSize);
+    }
   }

   if (DbgMI)
     FirstDbgValue = DbgMI;

   Defs.clear();
   Uses.clear();
   VRegDefs.clear();
-  PendingLoads.clear();
 }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
+  PSV->printCustom(OS);
+  return OS;
+}
+
+void ScheduleDAGInstrs::Value2SUsMap::dump() {
+  for (auto &Itr : *this) {
+    if (Itr.first.is<const Value*>()) {
+      const Value *V = Itr.first.get<const Value*>();
+      if (isa<UndefValue>(V))
+        dbgs() << "Unknown";
+      else
+        V->printAsOperand(dbgs());
+    }
+    else if (Itr.first.is<const PseudoSourceValue*>())
+      dbgs() << Itr.first.get<const PseudoSourceValue*>();
+    else
+      llvm_unreachable("Unknown Value type.");
+
+    dbgs() << " : ";
+    dumpSUList(Itr.second);
+  }
+}
+
+/// Reduce maps in FIFO order, by N SUs. This is better than turning
+/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
+/// it avoids unnecessary edges between seen SUs above the new
+/// BarrierChain, and those below it.
+void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
+                                              Value2SUsMap &loads, unsigned N) {
+  DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
+        stores.dump();
+        dbgs() << "Loading SUnits:\n";
+        loads.dump());
+
+  // Insert all SU's NodeNums into a vector and sort it.
+  std::vector<unsigned> NodeNums;
+  NodeNums.reserve(stores.size() + loads.size());
+  for (auto &I : stores)
+    for (auto *SU : I.second)
+      NodeNums.push_back(SU->NodeNum);
+  for (auto &I : loads)
+    for (auto *SU : I.second)
+      NodeNums.push_back(SU->NodeNum);
+  std::sort(NodeNums.begin(), NodeNums.end());
+
+  // The N last elements in NodeNums will be removed, and the SU with
+  // the lowest NodeNum of them will become the new BarrierChain to
+  // let the not yet seen SUs have a dependency to the removed SUs.
+  assert (N <= NodeNums.size());
+  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
+  if (BarrierChain) {
+    // The aliasing and non-aliasing maps reduce independently of each
+    // other, but share a common BarrierChain. Check if the
+    // newBarrierChain is above the former one. If it is not, it may
+    // introduce a loop to use newBarrierChain, so keep the old one.
+    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
+      BarrierChain->addPredBarrier(newBarrierChain);
+      BarrierChain = newBarrierChain;
+      DEBUG(dbgs() << "Inserting new barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
+    }
+    else
+      DEBUG(dbgs() << "Keeping old barrier chain: SU("
+            << BarrierChain->NodeNum << ").\n";);
+  }
+  else
+    BarrierChain = newBarrierChain;
+
+  insertBarrierChain(stores);
+  insertBarrierChain(loads);
+
+  DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
+        stores.dump();
+        dbgs() << "Loading SUnits:\n";
+        loads.dump());
+}

 /// \brief Initialize register live-range state for updating kills.
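A standalone sketch of the barrier selection in reduceHugeMemNodeMaps() (illustrative only; pickNewBarrier is a stand-in): after sorting all mapped NodeNums, the element N positions from the end is the lowest NodeNum among the N SUs to be removed. Those N SUs, which sit lowest in the region and were therefore seen first by the bottom-up walk, are chained to the new barrier and dropped from the maps, so any instruction visited later needs only one edge to be ordered with all of them.

    #include <algorithm>
    #include <cassert>
    #include <vector>

    unsigned pickNewBarrier(std::vector<unsigned> NodeNums, unsigned N) {
      assert(N <= NodeNums.size());
      std::sort(NodeNums.begin(), NodeNums.end());
      return *(NodeNums.end() - N); // lowest NodeNum of the N removed SUs
    }

    int main() {
      // Six mapped SUs, reduce by N = 3: SU(40) becomes the new barrier,
      // and SU(40), SU(55), SU(60) are dropped from the maps.
      std::vector<unsigned> Nums = {5, 12, 23, 40, 55, 60};
      assert(pickNewBarrier(Nums, 3) == 40);
    }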