Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -85,6 +85,11 @@
                             SmallVectorImpl<MachineInstr *> &DelInstrs);
   void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                      SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+  std::pair<unsigned, unsigned>
+  getLatenciesForInstrSequences(MachineInstr &MI,
+                                SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                MachineTraceMetrics::Trace BlockTrace);
 };
 }
 
@@ -242,6 +247,29 @@
   }
 }
 
+/// Estimate the latency of the new and original instruction sequence by summing
+/// up the latencies of the inserted and deleted instructions. This assumes
+/// that the inserted and deleted instructions are dependent instruction chains,
+/// which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    MachineTraceMetrics::Trace BlockTrace) {
+  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+  unsigned NewRootLatency = 0;
+  // NewRoot is the last instruction in the \p InsInstrs vector.
+  MachineInstr *NewRoot = InsInstrs.back();
+  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+  unsigned RootLatency = 0;
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
+  return { NewRootLatency, RootLatency };
+}
+
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
@@ -257,10 +285,6 @@
                                        bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
-  // NewRoot is the last instruction in the \p InsInstrs vector.
-  unsigned NewRootIdx = InsInstrs.size() - 1;
-  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-
   // Get depth and latency of NewRoot and Root.
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
@@ -282,17 +306,9 @@
   // even if the instruction depths (data dependency cycles) become worse.
 
   // Account for the latency of the inserted and deleted instructions by
-  // adding up their latencies. This assumes that the inserted and deleted
-  // instructions are dependent instruction chains, which might not hold
-  // in all cases.
-  unsigned NewRootLatency = 0;
-  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
-    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
-  NewRootLatency += getLatency(Root, NewRoot, BlockTrace);
-
-  unsigned RootLatency = 0;
-  for (auto I : DelInstrs)
-    RootLatency += TSchedModel.computeInstrLatency(I);
+  unsigned NewRootLatency, RootLatency;
+  std::tie(NewRootLatency, RootLatency) =
+      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
@@ -459,11 +475,40 @@
     // The algorithm does not try to evaluate all patterns and pick the best.
     // This is only an artificial restriction though. In practice there is
     // mostly one pattern, and getMachineCombinerPatterns() can order patterns
-    // based on an internal cost heuristic.
+    // based on an internal cost heuristic. If EXPENSIVE_CHECKS are enabled,
+    // all patterns are checked to ensure later patterns do not provide better
+    // latency savings.
 
     if (!TII->getMachineCombinerPatterns(MI, Patterns))
       continue;
 
+#ifdef EXPENSIVE_CHECKS
+    // Check that the difference between original and new latency is
+    // decreasing for later patterns. This helps to discover sub-optimal
+    // pattern orderings.
+    long PrevLatencyDiff = std::numeric_limits<long>::max();
+    for (auto P : Patterns) {
+      SmallVector<MachineInstr *, 16> InsInstrs;
+      SmallVector<MachineInstr *, 16> DelInstrs;
+      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
+                                      InstrIdxForVirtReg);
+      // Found pattern, but did not generate alternative sequence.
+      // This can happen e.g. when an immediate could not be materialized
+      // in a single instruction.
+      if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+        continue;
+
+      unsigned NewRootLatency, RootLatency;
+      std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+          MI, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
+      long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+      assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+             "Current pattern is better than previous pattern.");
+      PrevLatencyDiff = CurrentLatencyDiff;
+    }
+#endif
+
     for (auto P : Patterns) {
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;
Index: test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
===================================================================
--- test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -26,8 +26,8 @@
 # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2s
-# PROFITABLE: %5:fpr64 = FNEGv2f32 %2
-# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
 ---
 name: f1_4s
 registers:
@@ -52,8 +52,8 @@
 # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_4s
-# PROFITABLE: %5:fpr128 = FNEGv4f32 %2
-# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
 ---
 name: f1_2d
 registers:
@@ -78,8 +78,8 @@
 # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2d
-# PROFITABLE: %5:fpr128 = FNEGv2f64 %2
-# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
 ---
 name: f1_both_fmul_2s
 registers: