Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -34,7 +35,13 @@
 
 STATISTIC(NumInstCombined, "Number of machineinst combined");
 
+static cl::opt<unsigned>
+inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
+    cl::desc("Incremental depth computation will be used for basic blocks "
+             "with more instructions than this threshold."), cl::init(500));
+
 namespace {
+
 class MachineCombiner : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
@@ -73,7 +80,7 @@
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                               MachineCombinerPattern Pattern);
+                               MachineCombinerPattern Pattern, bool SlackIsAccurate);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -247,7 +254,8 @@
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    MachineCombinerPattern Pattern) {
+    MachineCombinerPattern Pattern,
+    bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
@@ -258,7 +266,7 @@
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
 
-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
         dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
         dbgs() << " RootDepth: " << RootDepth << "\n");
 
@@ -270,28 +278,30 @@
   if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
     return NewRootDepth < RootDepth;
 
-  // A more flexible cost calculation for the critical path includes the slack
-  // of the original code sequence. This may allow the transform to proceed
-  // even if the instruction depths (data dependency cycles) become worse.
-
   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
   unsigned RootLatency = 0;
 
   for (auto I : DelInstrs)
     RootLatency += TSchedModel.computeInstrLatency(I);
 
-  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency +
+                           (SlackIsAccurate ? RootSlack : 0);
 
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
         dbgs() << " RootLatency: " << RootLatency << "\n";
-        dbgs() << " RootSlack: " << RootSlack << "\n";
+        if (SlackIsAccurate)
+          dbgs() << " RootSlack: " << RootSlack << "\n";
         dbgs() << " NewRootDepth + NewRootLatency = "
-               << NewRootDepth + NewRootLatency << "\n";
-        dbgs() << " RootDepth + RootLatency + RootSlack = "
-               << RootDepth + RootLatency + RootSlack << "\n";);
-
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
-  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+               << NewCycleCount << "\n";
+        if (SlackIsAccurate)
+          dbgs() << " RootDepth + RootLatency + RootSlack = "
+                 << OldCycleCount << "\n";
+        else
+          dbgs() << " RootDepth + RootLatency = "
+                 << OldCycleCount << "\n";
+        );
 
   return NewCycleCount <= OldCycleCount;
 }
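
As a worked example of the comparison in the hunk above, here is a minimal
standalone sketch of the cost test. The cycle counts are hypothetical (in the
pass they come from MachineTraceMetrics); the point is how dropping the stale
slack term flips a borderline decision when SlackIsAccurate is false.

    // Sketch of the cost test in improvesCriticalPathLen(), made-up numbers.
    #include <cassert>

    static bool combineIsProfitable(unsigned NewRootDepth,
                                    unsigned NewRootLatency,
                                    unsigned RootDepth, unsigned RootLatency,
                                    unsigned RootSlack, bool SlackIsAccurate) {
      unsigned NewCycleCount = NewRootDepth + NewRootLatency;
      // Slack is only trusted when the trace is fully up to date; under
      // incremental updates it may be stale, so it is left out of the budget.
      unsigned OldCycleCount =
          RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
      return NewCycleCount <= OldCycleCount;
    }

    int main() {
      // Accurate trace: 2 cycles of slack admit the new sequence (10 <= 10).
      assert(combineIsProfitable(7, 3, 6, 2, 2, /*SlackIsAccurate=*/true));
      // Incremental mode: slack dropped, same numbers are rejected (10 > 8).
      assert(!combineIsProfitable(7, 3, 6, 2, 2, /*SlackIsAccurate=*/false));
    }
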
@@ -357,14 +367,40 @@
 static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                      SmallVector<MachineInstr *, 16> InsInstrs,
                                      SmallVector<MachineInstr *, 16> DelInstrs,
-                                     MachineTraceMetrics *Traces) {
+                                     MachineTraceMetrics::Ensemble *MinInstr,
+                                     MachineBasicBlock::iterator &LastUpdate,
+                                     SparseSet<LiveRegUnit> &RegUnits,
+                                     bool IncrementalUpdate) {
+  if (IncrementalUpdate) {
+    // Update depths and RegUnits since the last update. We do this before
+    // removing instructions, which would invalidate the iterator up to MI.
+    auto End = ++MI.getIterator();
+    for (; LastUpdate != End; LastUpdate++) {
+      MinInstr->updateDepth(MBB, *LastUpdate, RegUnits);
+    }
+  }
+
   for (auto *InstrPtr : InsInstrs)
-    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
-  for (auto *InstrPtr : DelInstrs)
+    MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
+
+  for (auto *InstrPtr : DelInstrs) {
+    // Keep LastUpdate valid: step it past any instruction about to be erased.
+    if (&*LastUpdate == InstrPtr)
+      LastUpdate++;
     InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-  ++NumInstCombined;
-  Traces->invalidate(MBB);
-  Traces->verifyAnalysis();
+    // Erase all LiveRegs defined by the removed instruction.
+    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
+      if (I->MI == InstrPtr)
+        I = RegUnits.erase(I);
+      else
+        I++;
+    }
+  }
+
+  if (IncrementalUpdate)
+    for (auto *InstrPtr : InsInstrs)
+      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
+
+  NumInstCombined++;
 }
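
A note on the RegUnits bookkeeping in this hunk: each LiveRegUnit caches the
last instruction seen defining a register unit, so an entry naming an erased
instruction would dangle. The sketch below illustrates that purge loop with a
std::vector in place of llvm::SparseSet; the Instr and LiveRegUnit types here
are simplified stand-ins, not the real MachineTraceMetrics types.

    #include <cassert>
    #include <vector>

    struct Instr { int Id; };

    // Simplified stand-in: remembers the last def of a register unit.
    struct LiveRegUnit {
      unsigned RegUnit;
      const Instr *MI;
    };

    // Drop every entry whose cached "last def" was just erased, mirroring
    // the erase-while-iterating loop in insertDeleteInstructions().
    static void purgeDeadDefs(std::vector<LiveRegUnit> &RegUnits,
                              const Instr *Dead) {
      for (auto I = RegUnits.begin(); I != RegUnits.end();) {
        if (I->MI == Dead)
          I = RegUnits.erase(I); // erase() returns the next valid iterator.
        else
          ++I;
      }
    }

    int main() {
      Instr A{0}, B{1};
      std::vector<LiveRegUnit> RegUnits = {{0, &A}, {1, &B}, {2, &A}};
      purgeDeadDefs(RegUnits, &A);
      assert(RegUnits.size() == 1 && RegUnits[0].MI == &B);
    }
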
 
 /// Substitute a slow code sequence with a faster one by
@@ -376,11 +412,16 @@
 /// sequence is shorter.
 bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
   bool Changed = false;
-  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
-
+  bool IncrementalUpdate = false;
   auto BlockIter = MBB->begin();
+  auto LastUpdate = MBB->begin();
   // Check if the block is in a loop.
   const MachineLoop *ML = MLI->getLoopFor(MBB);
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(TRI->getNumRegUnits());
 
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
@@ -411,7 +452,6 @@
     // This is only an artificial restriction though. In practice there is
     // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
-
     if (!TII->getMachineCombinerPatterns(MI, Patterns))
       continue;
 
@@ -419,9 +459,6 @@
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;
       DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
-      if (!MinInstr)
-        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
-      Traces->verifyAnalysis();
       TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                       InstrIdxForVirtReg);
       unsigned NewInstCount = InsInstrs.size();
@@ -441,18 +478,36 @@
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
       if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
-        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                 LastUpdate, RegUnits, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
       } else {
-        // Calculating the trace metrics may be expensive,
-        // so only do this when necessary.
+        // For big basic blocks, we only compute the full trace the first time
+        // we get here. We do not invalidate the trace, but instead update the
+        // instruction depths incrementally.
+        // NOTE: Only the instruction depths up to MI are relatively accurate.
+        // All other trace information is not updated.
+        if (IncrementalUpdate) {
+          // Update depths since the last incremental update.
+          for (; LastUpdate != BlockIter; LastUpdate++)
+            MinInstr->updateDepth(MBB, *LastUpdate, RegUnits);
+        }
         MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        Traces->verifyAnalysis();
         if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
-                                    InstrIdxForVirtReg, P) &&
+                                    InstrIdxForVirtReg, P,
+                                    !IncrementalUpdate) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
-          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                   LastUpdate, RegUnits, IncrementalUpdate);
+          if (MBB->size() > inc_threshold)
+            // Use incremental depth updates for basic blocks above the
+            // threshold.
+            IncrementalUpdate = true;
+          else
+            MinInstr->invalidate(MBB);
+
           // Eagerly stop after the first pattern fires.
           Changed = true;
           break;
@@ -467,6 +522,8 @@
     }
   }
 
+  if (Changed)
+    Traces->invalidate(MBB);
   return Changed;
 }
Index: test/CodeGen/AArch64/machine-combiner.ll
===================================================================
--- test/CodeGen/AArch64/machine-combiner.ll
+++ test/CodeGen/AArch64/machine-combiner.ll
@@ -1,5 +1,10 @@
 ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
+; RUN:   -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
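
These RUN lines (and the ones in the X86 tests below) force the incremental
path by setting the threshold to 0. In a build with assertions, one way to
watch the cost model switch is via the pass debug output; this invocation
assumes the pass's DEBUG_TYPE is "machine-combiner":

    llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
        -disable-post-ra -machine-combiner-inc-threshold=0 \
        -debug-only=machine-combiner test/CodeGen/AArch64/machine-combiner.ll \
        -o /dev/null

After the first combine fires in a block, the debug lines should change from
"RootDepth + RootLatency + RootSlack" to "RootDepth + RootLatency", i.e. the
SlackIsAccurate=false path shown earlier.
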
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0 | FileCheck %s --check-prefix=X64-HSW
 
 ; Function Attrs: norecurse nounwind readnone uwtable
 define i32 @mult(i32, i32) local_unnamed_addr #0 {