Index: include/llvm/CodeGen/MachineCombinerPattern.h =================================================================== --- include/llvm/CodeGen/MachineCombinerPattern.h +++ include/llvm/CodeGen/MachineCombinerPattern.h @@ -21,6 +21,12 @@ /// /// namespace MachineCombinerPattern { + +// Encode optional information into the enum value of each pattern. +enum MC_COST_CALCULATION : int { + USE_SLACK = 1 << 31 +}; + // Forward declaration enum MC_PATTERN : int { // These are commutative variants for reassociating a computation chain. See @@ -29,10 +35,11 @@ MC_REASSOC_AX_YB = 1, MC_REASSOC_XA_BY = 2, MC_REASSOC_XA_YB = 3, + LAST_REASSOC_PATTERN = MC_REASSOC_XA_YB, - /// Enumeration of instruction pattern supported by AArch64 machine combiner - MC_NONE, - MC_MULADDW_OP1, + // These AArch64 patterns allow slack in the critical path calculation to + // enable more optimization possibilities. + MC_MULADDW_OP1 = (LAST_REASSOC_PATTERN + 1) | USE_SLACK, MC_MULADDW_OP2, MC_MULSUBW_OP1, MC_MULSUBW_OP2, Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -72,7 +72,7 @@ MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, DenseMap &InstrIdxForVirtReg, - bool NewCodeHasLessInsts); + bool MustReduceCriticalPath, bool MustReduceDepth); bool preservesResourceLen(MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, @@ -210,43 +210,50 @@ return NewRootLatency; } -/// True when the new instruction sequence does not lengthen the critical path -/// and the new sequence has less instructions or the new sequence improves the -/// critical path. /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. /// The new code sequence ends in MI NewRoot. A necessary condition for the new /// sequence to replace the old sequence is that it cannot lengthen the critical -/// path. This is decided by the formula: -/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). -/// If the new sequence has an equal length critical path but does not reduce -/// the number of instructions (NewCodeHasLessInsts is false), then it is not -/// considered an improvement. The slack is the number of cycles Root can be -/// delayed before the critical patch becomes longer. +/// path. The definition of "improve" may be restricted by specifying that the +/// new path is strictly shorter than the old path (MustReduceCriticalPath) and +/// that the new path improves the data dependency chain (MustReduceDepth). bool MachineCombiner::improvesCriticalPathLen( MachineBasicBlock *MBB, MachineInstr *Root, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, DenseMap &InstrIdxForVirtReg, - bool NewCodeHasLessInsts) { + bool MustReduceCriticalPath, bool MustReduceDepth) { assert(TSchedModel.hasInstrSchedModelOrItineraries() && "Missing machine model\n"); // NewRoot is the last instruction in the \p InsInstrs vector. - // Get depth and latency of NewRoot. unsigned NewRootIdx = InsInstrs.size() - 1; MachineInstr *NewRoot = InsInstrs[NewRootIdx]; - unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); - unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); - // Get depth, latency and slack of Root. + // Get depth and latency of NewRoot and Root. + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; + dbgs() << " RootDepth: " << RootDepth << "\n"); + + // For a transform such as reassociation, the cost equation is + // conservatively calculated so that we must improve the depth (data + // dependency cycles) in the critical path to proceed with the transform. + // Being conservative also protects against inaccuracies in the underlying + // machine trace metrics and CPU models. + if (MustReduceDepth) + return NewRootDepth < RootDepth; + + // A more flexible cost calculation for the critical path includes the slack + // of the original code sequence. This may allow the transform to proceed + // even if the instruction depths (data dependency cycles) become worse. + unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); unsigned RootLatency = TSchedModel.computeInstrLatency(Root); unsigned RootSlack = BlockTrace.getInstrSlack(Root); - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; - dbgs() << " NewRootDepth: " << NewRootDepth - << " NewRootLatency: " << NewRootLatency << "\n"; - dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency - << " RootSlack: " << RootSlack << "\n"; + DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootLatency: " << RootLatency << "\n"; + dbgs() << " RootSlack: " << RootSlack << "\n"; dbgs() << " NewRootDepth + NewRootLatency = " << NewRootDepth + NewRootLatency << "\n"; dbgs() << " RootDepth + RootLatency + RootSlack = " @@ -255,10 +262,10 @@ unsigned NewCycleCount = NewRootDepth + NewRootLatency; unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; - if (NewCodeHasLessInsts) - return NewCycleCount <= OldCycleCount; - else + if (MustReduceCriticalPath) return NewCycleCount < OldCycleCount; + else + return NewCycleCount <= OldCycleCount; } /// helper routine to convert instructions into SC @@ -380,14 +387,18 @@ // in a single instruction. if (!NewInstCount) continue; + // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. + bool MustReduceCriticalPath = NewInstCount >= OldInstCount; + bool MustReduceDepth = !(P & MachineCombinerPattern::USE_SLACK); if (doSubstitute(NewInstCount, OldInstCount) || (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - InstrIdxForVirtReg, - NewInstCount < OldInstCount) && + InstrIdxForVirtReg, + MustReduceCriticalPath, + MustReduceDepth) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { for (auto *InstrPtr : InsInstrs) MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); Index: test/CodeGen/X86/machine-combiner.ll =================================================================== --- test/CodeGen/X86/machine-combiner.ll +++ test/CodeGen/X86/machine-combiner.ll @@ -632,10 +632,10 @@ ; AVX-NEXT: callq bar ; AVX-NEXT: vmovsd %xmm0, (%rsp) ; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd (%rsp), %xmm1 -; AVX: vaddsd 8(%rsp), %xmm1, %xmm1 +; AVX-NEXT: vmovsd 8(%rsp), %xmm1 +; AVX: vaddsd 16(%rsp), %xmm1, %xmm1 +; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddsd 16(%rsp), %xmm0, %xmm0 %x0 = call double @bar() %x1 = call double @bar() @@ -656,9 +656,10 @@ ; AVX-NEXT: callq bar ; AVX-NEXT: vmovsd %xmm0, (%rsp) ; AVX-NEXT: callq bar +; AVX-NEXT: vmovsd 8(%rsp), %xmm1 +; AVX: vaddsd 16(%rsp), %xmm1, %xmm1 ; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 -; AVX-NEXT: vaddsd 8(%rsp), %xmm0, %xmm0 -; AVX-NEXT: vaddsd 16(%rsp), %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 %x0 = call double @bar() %x1 = call double @bar()