Index: include/llvm/CodeGen/MachineTraceMetrics.h =================================================================== --- include/llvm/CodeGen/MachineTraceMetrics.h +++ include/llvm/CodeGen/MachineTraceMetrics.h @@ -47,6 +47,7 @@ #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H #define LLVM_CODEGEN_MACHINETRACEMETRICS_H +#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" @@ -68,6 +69,22 @@ class TargetInstrInfo; class TargetRegisterInfo; +// Keep track of physreg data dependencies by recording each live register unit. +// Associate each regunit with an instruction operand. Depending on the +// direction instructions are scanned, it could be the operand that defined the +// regunit, or the highest operand to read the regunit. +struct LiveRegUnit { + unsigned RegUnit; + unsigned Cycle = 0; + const MachineInstr *MI = nullptr; + unsigned Op = 0; + + unsigned getSparseSetIndex() const { return RegUnit; } + + LiveRegUnit(unsigned RU) : RegUnit(RU) {} +}; + + class MachineTraceMetrics : public MachineFunctionPass { const MachineFunction *MF = nullptr; const TargetInstrInfo *TII = nullptr; @@ -243,7 +260,7 @@ unsigned getBlockNum() const { return &TBI - &TE.BlockInfo[0]; } public: - explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {} + explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {} void print(raw_ostream&) const; @@ -343,6 +360,7 @@ /// Get the trace that passes through MBB. /// The trace is computed on demand. Trace getTrace(const MachineBasicBlock *MBB); + void updateDepth(const MachineBasicBlock *, const MachineInstr&, SparseSet *RegUnits); }; /// Strategies for selecting traces. @@ -367,7 +385,6 @@ /// /// Call Ensemble::getTrace() again to update any trace handles. void invalidate(const MachineBasicBlock *MBB); - private: // One entry per basic block, indexed by block number. SmallVector BlockInfo; Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -35,6 +35,7 @@ STATISTIC(NumInstCombined, "Number of machineinst combined"); namespace { + class MachineCombiner : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -258,7 +259,7 @@ unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth; - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n"; dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; dbgs() << " RootDepth: " << RootDepth << "\n"); @@ -270,29 +271,22 @@ if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) return NewRootDepth < RootDepth; - // A more flexible cost calculation for the critical path includes the slack - // of the original code sequence. This may allow the transform to proceed - // even if the instruction depths (data dependency cycles) become worse. - unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); unsigned RootLatency = 0; for (auto I : DelInstrs) RootLatency += TSchedModel.computeInstrLatency(I); - unsigned RootSlack = BlockTrace.getInstrSlack(*Root); - DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; dbgs() << " RootLatency: " << RootLatency << "\n"; dbgs() << " RootSlack: " << RootSlack << "\n"; dbgs() << " NewRootDepth + NewRootLatency = " << NewRootDepth + NewRootLatency << "\n"; - dbgs() << " RootDepth + RootLatency + RootSlack = " - << RootDepth + RootLatency + RootSlack << "\n";); + dbgs() << " RootDepth + RootLatency = " + << RootDepth + RootLatency << "\n";); unsigned NewCycleCount = NewRootDepth + NewRootLatency; - unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; - + unsigned OldCycleCount = RootDepth + RootLatency; return NewCycleCount <= OldCycleCount; } @@ -357,14 +351,21 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVector InsInstrs, SmallVector DelInstrs, - MachineTraceMetrics *Traces) { + MachineTraceMetrics::Ensemble *MinInstr, + MachineBasicBlock::iterator &LastUpdate) { for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) + MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); + + for (auto *InstrPtr : DelInstrs) { + if (InstrPtr->getIterator() == LastUpdate) + LastUpdate++; InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - ++NumInstCombined; - Traces->invalidate(MBB); - Traces->verifyAnalysis(); + } + + for (auto *InstrPtr : InsInstrs) + MinInstr->updateDepth(MBB, *InstrPtr, nullptr); + + NumInstCombined++; } /// Substitute a slow code sequence with a faster one by @@ -379,8 +380,11 @@ DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); auto BlockIter = MBB->begin(); + auto LastUpdate = MBB->begin(); // Check if the block is in a loop. const MachineLoop *ML = MLI->getLoopFor(MBB); + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; @@ -415,13 +419,11 @@ if (!TII->getMachineCombinerPatterns(MI, Patterns)) continue; + MinInstr->updateDepth(MBB, MI, nullptr); for (auto P : Patterns) { SmallVector InsInstrs; SmallVector DelInstrs; DenseMap InstrIdxForVirtReg; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - Traces->verifyAnalysis(); TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, InstrIdxForVirtReg); unsigned NewInstCount = InsInstrs.size(); @@ -436,23 +438,31 @@ if (ML && TII->isThroughputPattern(P)) SubstituteAlways = true; + for (; LastUpdate != (MachineBasicBlock::iterator) &MI; LastUpdate++) + MinInstr->updateDepth(MBB, *LastUpdate, nullptr); + // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, LastUpdate); // Eagerly stop after the first pattern fires. Changed = true; break; } else { - // Calculating the trace metrics may be expensive, - // so only do this when necessary. + // We only compute the full trace the first time we hit this the first + // time. We do not invalidate the trace, but instead update the + // instruction depths incrementally. + // NOTE: Only the instruction depths up to MI are accurate. All other + // trace information is not updated. MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, InstrIdxForVirtReg, P) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, LastUpdate); + // Eagerly stop after the first pattern fires. Changed = true; break; @@ -467,6 +477,8 @@ } } + if (Changed) + Traces->invalidate(MBB); return Changed; } Index: lib/CodeGen/MachineTraceMetrics.cpp =================================================================== --- lib/CodeGen/MachineTraceMetrics.cpp +++ lib/CodeGen/MachineTraceMetrics.cpp @@ -694,25 +694,6 @@ } } -// Keep track of physreg data dependencies by recording each live register unit. -// Associate each regunit with an instruction operand. Depending on the -// direction instructions are scanned, it could be the operand that defined the -// regunit, or the highest operand to read the regunit. -namespace { - -struct LiveRegUnit { - unsigned RegUnit; - unsigned Cycle = 0; - const MachineInstr *MI = nullptr; - unsigned Op = 0; - - unsigned getSparseSetIndex() const { return RegUnit; } - - LiveRegUnit(unsigned RU) : RegUnit(RU) {} -}; - -} // end anonymous namespace - // Identify physreg dependencies for UseMI, and update the live regunit // tracking set when scanning instructions downwards. static void updatePhysDepsDownwards(const MachineInstr *UseMI, @@ -797,6 +778,48 @@ return MaxLen; } +void MachineTraceMetrics::Ensemble:: +updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI, + SparseSet *RegUnits) { + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + SmallVector Deps; + // Collect all data dependencies. + if (UseMI.isPHI()) + getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI); + else if (getDataDeps(UseMI, Deps, MTM.MRI) && RegUnits) + updatePhysDepsDownwards(&UseMI, Deps, *RegUnits, MTM.TRI); + + // Filter and process dependencies, computing the earliest issue cycle. + unsigned Cycle = 0; + for (const DataDep &Dep : Deps) { + const TraceBlockInfo&DepTBI = + BlockInfo[Dep.DefMI->getParent()->getNumber()]; + // Ignore dependencies from outside the current trace. + if (!DepTBI.isUsefulDominator(TBI)) + continue; + assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency"); + unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth; + // Add latency if DefMI is a real instruction. Transients get latency 0. + if (!Dep.DefMI->isTransient()) + DepCycle += MTM.SchedModel + .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp); + Cycle = std::max(Cycle, DepCycle); + } + // Remember the instruction depth. + InstrCycles &MICycles = Cycles[&UseMI]; + MICycles.Depth = Cycle; + + if (!TBI.HasValidInstrHeights) { + DEBUG(dbgs() << Cycle << '\t' << UseMI); + return; + } + + + // Update critical path length. + TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height); + DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI); + +} /// Compute instruction depths for all instructions above or in MBB in its /// trace. This assumes that the trace through MBB has already been computed. void MachineTraceMetrics::Ensemble:: @@ -818,11 +841,10 @@ // in the trace. We should track any live-out physregs that were defined in // the trace. This is quite rare in SSA form, typically created by CSE // hoisting a compare. + SparseSet RegUnits; RegUnits.setUniverse(MTM.TRI->getNumRegUnits()); - // Go through trace blocks in top-down order, stopping after the center block. - SmallVector Deps; while (!Stack.empty()) { MBB = Stack.pop_back_val(); DEBUG(dbgs() << "\nDepths for BB#" << MBB->getNumber() << ":\n"); @@ -848,40 +870,7 @@ TBI.CriticalPath = computeCrossBlockCriticalPath(TBI); for (const auto &UseMI : *MBB) { - // Collect all data dependencies. - Deps.clear(); - if (UseMI.isPHI()) - getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI); - else if (getDataDeps(UseMI, Deps, MTM.MRI)) - updatePhysDepsDownwards(&UseMI, Deps, RegUnits, MTM.TRI); - - // Filter and process dependencies, computing the earliest issue cycle. - unsigned Cycle = 0; - for (const DataDep &Dep : Deps) { - const TraceBlockInfo&DepTBI = - BlockInfo[Dep.DefMI->getParent()->getNumber()]; - // Ignore dependencies from outside the current trace. - if (!DepTBI.isUsefulDominator(TBI)) - continue; - assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency"); - unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth; - // Add latency if DefMI is a real instruction. Transients get latency 0. - if (!Dep.DefMI->isTransient()) - DepCycle += MTM.SchedModel - .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp); - Cycle = std::max(Cycle, DepCycle); - } - // Remember the instruction depth. - InstrCycles &MICycles = Cycles[&UseMI]; - MICycles.Depth = Cycle; - - if (!TBI.HasValidInstrHeights) { - DEBUG(dbgs() << Cycle << '\t' << UseMI); - continue; - } - // Update critical path length. - TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height); - DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI); + updateDepth(MBB, UseMI, &RegUnits); } } }