Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -21,6 +21,12 @@
 ///
 ///
 namespace MachineCombinerPattern {
+
+// Encode optional information into the enum value of each pattern.
+enum MC_COST_CALCULATION : int {
+  USE_SLACK = 1 << 31
+};
+
 // Forward declaration
 enum MC_PATTERN : int {
   // These are commutative variants for reassociating a computation chain. See
@@ -29,10 +35,10 @@
   MC_REASSOC_AX_YB = 1,
   MC_REASSOC_XA_BY = 2,
   MC_REASSOC_XA_YB = 3,
+  LAST_REASSOC_PATTERN = MC_REASSOC_XA_YB,
 
   /// Enumeration of instruction pattern supported by AArch64 machine combiner
-  MC_NONE,
-  MC_MULADDW_OP1,
+  MC_MULADDW_OP1 = (LAST_REASSOC_PATTERN + 1) | USE_SLACK,
   MC_MULADDW_OP2,
   MC_MULSUBW_OP1,
   MC_MULSUBW_OP2,
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -71,7 +71,7 @@
                                MachineTraceMetrics::Trace BlockTrace,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                               bool NewCodeHasLessInsts);
+                               bool NewCodeHasLessInsts, bool UseSlack);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -222,13 +222,15 @@
 /// If the new sequence has an equal length critical path but does not reduce
 /// the number of instructions (NewCodeHasLessInsts is false), then it is not
 /// considered an improvement. The slack is the number of cycles Root can be
-/// delayed before the critical patch becomes longer.
+/// delayed before the critical path becomes longer. Slack may optionally be
+/// excluded from the calculation to provide a more conservative estimate of
+/// the original critical path length.
 bool MachineCombiner::improvesCriticalPathLen(
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    bool NewCodeHasLessInsts) {
+    bool NewCodeHasLessInsts, bool UseSlack) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
 
@@ -242,7 +244,7 @@
   // Get depth, latency and slack of Root.
   unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
   unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
-  unsigned RootSlack = BlockTrace.getInstrSlack(Root);
+  unsigned RootSlack = UseSlack ? BlockTrace.getInstrSlack(Root) : 0;
 
   DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
         dbgs() << " NewRootDepth: " << NewRootDepth
@@ -387,8 +389,9 @@
       // resource pressure.
       if (doSubstitute(NewInstCount, OldInstCount) ||
           (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                    InstrIdxForVirtReg,
-                                    NewInstCount < OldInstCount) &&
+                                    InstrIdxForVirtReg,
+                                    NewInstCount < OldInstCount,
+                                    P & MachineCombinerPattern::USE_SLACK) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
         for (auto *InstrPtr : InsInstrs)
           MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -632,10 +632,10 @@
 ; AVX-NEXT:    callq bar
 ; AVX-NEXT:    vmovsd %xmm0, (%rsp)
 ; AVX-NEXT:    callq bar
-; AVX-NEXT:    vmovsd (%rsp), %xmm1
-; AVX:         vaddsd 8(%rsp), %xmm1, %xmm1
+; AVX-NEXT:    vmovsd 8(%rsp), %xmm1
+; AVX:         vaddsd 16(%rsp), %xmm1, %xmm1
+; AVX-NEXT:    vaddsd (%rsp), %xmm0, %xmm0
 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddsd 16(%rsp), %xmm0, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()
@@ -656,9 +656,10 @@
 ; AVX-NEXT:    callq bar
 ; AVX-NEXT:    vmovsd %xmm0, (%rsp)
 ; AVX-NEXT:    callq bar
+; AVX-NEXT:    vmovsd 8(%rsp), %xmm1
+; AVX:         vaddsd 16(%rsp), %xmm1, %xmm1
 ; AVX-NEXT:    vaddsd (%rsp), %xmm0, %xmm0
-; AVX-NEXT:    vaddsd 8(%rsp), %xmm0, %xmm0
-; AVX-NEXT:    vaddsd 16(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()
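
A note on the encoding (not part of the patch): the tagged patterns remain
ordinary MC_PATTERN enum values, which is what lets combineInstructions test
P & MachineCombinerPattern::USE_SLACK directly. The sketch below is a minimal
standalone illustration of that bit-packing under the patch's scheme; the
enumerator names are copied from the diff, while the main() driver and its
output are purely illustrative.

// Minimal sketch, not part of the patch: demonstrates the USE_SLACK
// tagging scheme from the MachineCombinerPattern.h hunk above.
#include <cstdio>
#include <initializer_list>

enum MC_COST_CALCULATION : int {
  USE_SLACK = 1 << 31 // as in the patch; a portability-minded variant
                      // would form the mask from an unsigned value
};

enum MC_PATTERN : int {
  MC_REASSOC_AX_BY = 0,
  MC_REASSOC_AX_YB = 1,
  MC_REASSOC_XA_BY = 2,
  MC_REASSOC_XA_YB = 3,
  LAST_REASSOC_PATTERN = MC_REASSOC_XA_YB,
  // The first target pattern sets the high bit explicitly; every
  // enumerator after it inherits the bit via the usual +1 increment.
  MC_MULADDW_OP1 = (LAST_REASSOC_PATTERN + 1) | USE_SLACK,
  MC_MULADDW_OP2,
};

int main() {
  for (int P : {MC_REASSOC_AX_BY, MC_MULADDW_OP1, MC_MULADDW_OP2}) {
    bool UseSlack = (P & USE_SLACK) != 0; // the test the combiner performs
    int Index = P & ~USE_SLACK;           // pattern index with the tag stripped
    std::printf("pattern %d: UseSlack = %d\n", Index, UseSlack ? 1 : 0);
  }
  return 0;
}

This prints UseSlack = 1 for both multiply-add patterns and 0 for the
reassociation pattern: the tagged (AArch64) patterns keep the old
slack-inclusive cost calculation, while the untagged reassociation patterns
now get RootSlack = 0, the more conservative critical-path estimate the new
comment describes.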