Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -39,6 +39,20 @@ cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500)); +#ifdef EXPENSIVE_CHECKS +static cl::opt VerifyPatternOrder( + "machine-combiner-verify-pattern-order", cl::Hidden, + cl::desc( + "Verify that the generated patterns are ordered by increasing latency"), + cl::init(true)); +#else +static cl::opt VerifyPatternOrder( + "machine-combiner-verify-pattern-order", cl::Hidden, + cl::desc( + "Verify that the generated patterns are ordered by increasing latency"), + cl::init(false)); +#endif + namespace { class MachineCombiner : public MachineFunctionPass { const TargetInstrInfo *TII; @@ -85,6 +99,14 @@ SmallVectorImpl &DelInstrs); void instr2instrSC(SmallVectorImpl &Instrs, SmallVectorImpl &InstrsSC); + std::pair + getLatenciesForInstrSequences(MachineInstr &MI, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + MachineTraceMetrics::Trace BlockTrace); + + void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root, + SmallVector &Patterns); }; } @@ -242,6 +264,29 @@ } } +/// Estimate the latency of the new and original instruction sequence by summing +/// up the latencies of the inserted and deleted instructions. This assumes +/// that the inserted and deleted instructions are dependent instruction chains, +/// which might not hold in all cases. +std::pair MachineCombiner::getLatenciesForInstrSequences( + MachineInstr &MI, SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + MachineTraceMetrics::Trace BlockTrace) { + assert(!InsInstrs.empty() && "Only support sequences that insert instrs."); + unsigned NewRootLatency = 0; + // NewRoot is the last instruction in the \p InsInstrs vector. + MachineInstr *NewRoot = InsInstrs.back(); + for (unsigned i = 0; i < InsInstrs.size() - 1; i++) + NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]); + NewRootLatency += getLatency(&MI, NewRoot, BlockTrace); + + unsigned RootLatency = 0; + for (auto I : DelInstrs) + RootLatency += TSchedModel.computeInstrLatency(I); + + return {NewRootLatency, RootLatency}; +} + /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. /// The new code sequence ends in MI NewRoot. A necessary condition for the new /// sequence to replace the old sequence is that it cannot lengthen the critical @@ -257,10 +302,6 @@ bool SlackIsAccurate) { assert(TSchedModel.hasInstrSchedModelOrItineraries() && "Missing machine model\n"); - // NewRoot is the last instruction in the \p InsInstrs vector. - unsigned NewRootIdx = InsInstrs.size() - 1; - MachineInstr *NewRoot = InsInstrs[NewRootIdx]; - // Get depth and latency of NewRoot and Root. unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth; @@ -282,17 +323,9 @@ // even if the instruction depths (data dependency cycles) become worse. // Account for the latency of the inserted and deleted instructions by - // adding up their latencies. This assumes that the inserted and deleted - // instructions are dependent instruction chains, which might not hold - // in all cases. - unsigned NewRootLatency = 0; - for (unsigned i = 0; i < InsInstrs.size() - 1; i++) - NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]); - NewRootLatency += getLatency(Root, NewRoot, BlockTrace); - - unsigned RootLatency = 0; - for (auto I : DelInstrs) - RootLatency += TSchedModel.computeInstrLatency(I); + unsigned NewRootLatency, RootLatency; + std::tie(NewRootLatency, RootLatency) = + getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace); unsigned RootSlack = BlockTrace.getInstrSlack(*Root); unsigned NewCycleCount = NewRootDepth + NewRootLatency; @@ -409,6 +442,34 @@ NumInstCombined++; } +// Check that the difference between original and new latency is decreasing for +// later patterns. This helps to discover sub-optimal pattern orderings. +void MachineCombiner::verifyPatternOrder( + MachineBasicBlock *MBB, MachineInstr &Root, + SmallVector &Patterns) { + long PrevLatencyDiff = std::numeric_limits::max(); + for (auto P : Patterns) { + SmallVector InsInstrs; + SmallVector DelInstrs; + DenseMap InstrIdxForVirtReg; + TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries()) + continue; + + unsigned NewRootLatency, RootLatency; + std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences( + Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB)); + long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency); + assert(CurrentLatencyDiff <= PrevLatencyDiff && + "Current pattern is better than previous pattern."); + PrevLatencyDiff = CurrentLatencyDiff; + } +} + /// Substitute a slow code sequence with a faster one by /// evaluating instruction combining pattern. /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction @@ -459,11 +520,16 @@ // The algorithm does not try to evaluate all patterns and pick the best. // This is only an artificial restriction though. In practice there is // mostly one pattern, and getMachineCombinerPatterns() can order patterns - // based on an internal cost heuristic. + // based on an internal cost heuristic. If + // machine-combiner-verify-pattern-order is enabled, all patterns are + // checked to ensure later patterns do not provide better latency savings. if (!TII->getMachineCombinerPatterns(MI, Patterns)) continue; + if (VerifyPatternOrder) + verifyPatternOrder(MBB, MI, Patterns); + for (auto P : Patterns) { SmallVector InsInstrs; SmallVector DelInstrs; Index: test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir =================================================================== --- test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir +++ test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir @@ -1,7 +1,7 @@ -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s -# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s +# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s # name: f1_2s registers: @@ -26,8 +26,8 @@ # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2 # # PROFITABLE-LABEL: name: f1_2s -# PROFITABLE: %5:fpr64 = FNEGv2f32 %2 -# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1 +# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2 +# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1 --- name: f1_4s registers: @@ -52,8 +52,8 @@ # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2 # # PROFITABLE-LABEL: name: f1_4s -# PROFITABLE: %5:fpr128 = FNEGv4f32 %2 -# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1 +# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2 +# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1 --- name: f1_2d registers: @@ -78,8 +78,8 @@ # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2 # # PROFITABLE-LABEL: name: f1_2d -# PROFITABLE: %5:fpr128 = FNEGv2f64 %2 -# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1 +# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2 +# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1 --- name: f1_both_fmul_2s registers: Index: test/CodeGen/AArch64/machine-combiner.ll =================================================================== --- test/CodeGen/AArch64/machine-combiner.ll +++ test/CodeGen/AArch64/machine-combiner.ll @@ -3,7 +3,7 @@ ; Incremental updates of the instruction depths should be enough for this test ; case. ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \ -; RUN: -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s +; RUN: -disable-post-ra -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s ; Verify that the first two adds are independent regardless of how the inputs are ; commuted. The destination registers are used as source registers for the third add. Index: test/CodeGen/AArch64/machine-combiner.mir =================================================================== --- test/CodeGen/AArch64/machine-combiner.mir +++ test/CodeGen/AArch64/machine-combiner.mir @@ -1,6 +1,6 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -enable-unsafe-fp-math \ # RUN: -run-pass machine-combiner -machine-combiner-inc-threshold=0 \ -# RUN: -verify-machineinstrs -o - %s | FileCheck %s +# RUN: -machine-combiner-verify-pattern-order=true -verify-machineinstrs -o - %s | FileCheck %s --- # Test incremental depth updates succeed when triggered after the removal of # the first instruction in a basic block. Index: test/CodeGen/X86/machine-combiner-int-vec.ll =================================================================== --- test/CodeGen/X86/machine-combiner-int-vec.ll +++ test/CodeGen/X86/machine-combiner-int-vec.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 < %s | FileCheck %s --check-prefix=SSE -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX ; Verify that 128-bit vector logical ops are reassociated. Index: test/CodeGen/X86/machine-combiner-int.ll =================================================================== --- test/CodeGen/X86/machine-combiner-int.ll +++ test/CodeGen/X86/machine-combiner-int.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o - | FileCheck %s --check-prefix=DEAD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -machine-combiner-verify-pattern-order=true | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -machine-combiner-verify-pattern-order=true -o - | FileCheck %s --check-prefix=DEAD ; Verify that integer multiplies are reassociated. The first multiply in ; each test should be independent of the result of the preceding add (lea). Index: test/CodeGen/X86/machine-combiner.ll =================================================================== --- test/CodeGen/X86/machine-combiner.ll +++ test/CodeGen/X86/machine-combiner.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX ; Incremental updates of the instruction depths should be enough for this test ; case.