Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -72,7 +72,7 @@
                                MachineTraceMetrics::Trace BlockTrace,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                               bool NewCodeHasLessInsts);
+                               bool MustReduceCriticalPath, bool MustReduceDepth);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -210,43 +210,50 @@
   return NewRootLatency;
 }
 
-/// True when the new instruction sequence does not lengthen the critical path
-/// and the new sequence has less instructions or the new sequence improves the
-/// critical path.
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
-/// path. This is decided by the formula:
-/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
-/// If the new sequence has an equal length critical path but does not reduce
-/// the number of instructions (NewCodeHasLessInsts is false), then it is not
-/// considered an improvement. The slack is the number of cycles Root can be
-/// delayed before the critical patch becomes longer.
+/// path. The definition of "improve" may be restricted by specifying that the
+/// new path is strictly shorter than the old path (MustReduceCriticalPath) and
+/// that the new path improves the data dependency chain (MustReduceDepth).
 bool MachineCombiner::improvesCriticalPathLen(
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    bool NewCodeHasLessInsts) {
+    bool MustReduceCriticalPath, bool MustReduceDepth) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
-  // Get depth and latency of NewRoot.
   unsigned NewRootIdx = InsInstrs.size() - 1;
   MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
-  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
 
-  // Get depth, latency and slack of Root.
+  // Get depth and latency of NewRoot and Root.
+  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
+
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
+        dbgs() << " RootDepth: " << RootDepth << "\n");
+
+  // For a transform such as reassociation, the cost equation is
+  // conservatively calculated so that we must improve the depth (data
+  // dependency cycles) in the critical path to proceed with the transform.
+  // Being conservative also protects against inaccuracies in the underlying
+  // machine trace metrics and CPU models.
+  if (MustReduceDepth)
+    return NewRootDepth < RootDepth;
+
+  // A more flexible cost calculation for the critical path includes the slack
+  // of the original code sequence. This may allow the transform to proceed
+  // even if the instruction depths (data dependency cycles) become worse.
+  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
   unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
   unsigned RootSlack = BlockTrace.getInstrSlack(Root);
 
-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
-        dbgs() << " NewRootDepth: " << NewRootDepth
-               << " NewRootLatency: " << NewRootLatency << "\n";
-        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
-               << " RootSlack: " << RootSlack << "\n";
+  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
+        dbgs() << " RootLatency: " << RootLatency << "\n";
+        dbgs() << " RootSlack: " << RootSlack << "\n";
         dbgs() << " NewRootDepth + NewRootLatency = "
                << NewRootDepth + NewRootLatency << "\n";
         dbgs() << " RootDepth + RootLatency + RootSlack = "
@@ -255,10 +262,10 @@
 
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
   unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
 
-  if (NewCodeHasLessInsts)
-    return NewCycleCount <= OldCycleCount;
-  else
+  if (MustReduceCriticalPath)
     return NewCycleCount < OldCycleCount;
+  else
+    return NewCycleCount <= OldCycleCount;
 }
 
 /// helper routine to convert instructions into SC
@@ -319,6 +326,23 @@
   return false;
 }
 
+/// For some patterns, we want to use a more conservative cost metric when
+/// evaluating if a transform is beneficial. These patterns require that the
+/// depth calculated by MachineTraceMetrics is improved by the transform.
+static bool mustReduceDepth(MachineCombinerPattern P) {
+  // TODO: If C++ ever gets a real enum class, make this part of the
+  // MachineCombinerPattern class.
+  switch (P) {
+  case MachineCombinerPattern::REASSOC_AX_BY:
+  case MachineCombinerPattern::REASSOC_AX_YB:
+  case MachineCombinerPattern::REASSOC_XA_BY:
+  case MachineCombinerPattern::REASSOC_XA_YB:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// Substitute a slow code sequence with a faster one by
 /// evaluating instruction combining pattern.
 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -380,14 +404,17 @@
       // in a single instruction.
       if (!NewInstCount)
         continue;
+
       // Substitute when we optimize for codesize and the new sequence has
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
+      bool MustReduceCriticalPath = NewInstCount >= OldInstCount;
       if (doSubstitute(NewInstCount, OldInstCount) ||
           (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                   InstrIdxForVirtReg,
-                                   NewInstCount < OldInstCount) &&
+                                   InstrIdxForVirtReg,
+                                   MustReduceCriticalPath,
+                                   mustReduceDepth(P)) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
         for (auto *InstrPtr : InsInstrs)
           MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -632,10 +632,10 @@
 ; AVX-NEXT:    callq bar
 ; AVX-NEXT:    vmovsd %xmm0, (%rsp)
 ; AVX-NEXT:    callq bar
-; AVX-NEXT:    vmovsd (%rsp), %xmm1
-; AVX:         vaddsd 8(%rsp), %xmm1, %xmm1
+; AVX-NEXT:    vmovsd 8(%rsp), %xmm1
+; AVX:         vaddsd 16(%rsp), %xmm1, %xmm1
+; AVX-NEXT:    vaddsd (%rsp), %xmm0, %xmm0
 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddsd 16(%rsp), %xmm0, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()
@@ -656,9 +656,10 @@
 ; AVX-NEXT:    callq bar
 ; AVX-NEXT:    vmovsd %xmm0, (%rsp)
 ; AVX-NEXT:    callq bar
+; AVX-NEXT:    vmovsd 8(%rsp), %xmm1
+; AVX:         vaddsd 16(%rsp), %xmm1, %xmm1
 ; AVX-NEXT:    vaddsd (%rsp), %xmm0, %xmm0
-; AVX-NEXT:    vaddsd 8(%rsp), %xmm0, %xmm0
-; AVX-NEXT:    vaddsd 16(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()
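
Note on the new cost policy: when RootSlack participates in the comparison, a
transform can be accepted even though it does not shorten the data dependency
chain, because slack on the old root absorbs the difference. The depth-only
gate for the reassociation patterns closes that hole. Below is a minimal
standalone C++ sketch (not the LLVM API; the PathCosts struct, the improves()
helper, and the cycle counts are hypothetical, chosen only to illustrate the
two checks this patch introduces):

#include <cstdio>

// Hypothetical cycle counts for illustration only; in the pass these come
// from MachineTraceMetrics and the target scheduling model.
struct PathCosts {
  unsigned NewRootDepth;   // depth of the last instruction in the new sequence
  unsigned RootDepth;      // depth of the old root instruction
  unsigned NewRootLatency; // latency of the new root
  unsigned RootLatency;    // latency of the old root
  unsigned RootSlack;      // cycles the old root can slip without lengthening
                           // the critical path
};

// Mirrors the decision structure improvesCriticalPathLen() has after this
// patch: depth-gated patterns (reassociation) must strictly reduce depth;
// everything else compares full path length, including the old root's slack,
// and must be strictly shorter only when the instruction count does not drop.
static bool improves(const PathCosts &C, bool MustReduceCriticalPath,
                     bool MustReduceDepth) {
  if (MustReduceDepth)
    return C.NewRootDepth < C.RootDepth;
  unsigned NewCycleCount = C.NewRootDepth + C.NewRootLatency;
  unsigned OldCycleCount = C.RootDepth + C.RootLatency + C.RootSlack;
  return MustReduceCriticalPath ? NewCycleCount < OldCycleCount
                                : NewCycleCount <= OldCycleCount;
}

int main() {
  // Depth is unchanged (4 -> 4), but the old root has 2 cycles of slack.
  PathCosts C = {/*NewRootDepth=*/4, /*RootDepth=*/4, /*NewRootLatency=*/3,
                 /*RootLatency=*/3, /*RootSlack=*/2};
  printf("depth-gated: %d\n", improves(C, false, true));  // 0: 4 < 4 fails
  printf("slack-aware: %d\n", improves(C, false, false)); // 1: 7 <= 9 holds
  return 0;
}

With these sample numbers, the depth-gated check rejects the transform while
the slack-aware check accepts it. That asymmetry is why mustReduceDepth(P)
demands a strict depth improvement for the reassociation patterns, while other
combines may still spend the root's slack; the rebalanced add trees in the
machine-combiner.ll checks above are accepted under the depth-gated metric.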