Index: test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test @@ -0,0 +1,59 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK-ALL %s + +# CHECK-ALL: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} + +# Instructions were executed serially, meaning that the next instruction +# *ONLY* starts executing when the current instruction finishes. +# Thus, the real latency of the first instruction is the per_snippet_value minus +# the sum of latencies of all the other instructions in the snippet. + +# RCR8rCL has latency of 11. (the value from scheduling profile!) +# Latency of the whole snippet is 12 or 23. (not measured, hand-written.) +# Thus, latency of BT32rr is 12-11 = 1, or 23-11-11 = 1 + +--- +mode: latency +key: + instructions: + - 'BT32rr R11D R11D' + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 0.0000, per_snippet_value: 12.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... 
+--- +mode: latency +key: + instructions: + - 'BT32rr R11D R11D' + - 'RCR8rCL R11B R11B' + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 99.0000, per_snippet_value: 23.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... Index: tools/llvm-exegesis/lib/Analysis.h =================================================================== --- tools/llvm-exegesis/lib/Analysis.h +++ tools/llvm-exegesis/lib/Analysis.h @@ -37,6 +37,7 @@ class Analysis { public: Analysis(const llvm::Target &Target, + std::unique_ptr SubtargetInfo, std::unique_ptr InstrInfo, const InstructionBenchmarkClustering &Clustering, double AnalysisInconsistencyEpsilon, Index: tools/llvm-exegesis/lib/Analysis.cpp =================================================================== --- tools/llvm-exegesis/lib/Analysis.cpp +++ tools/llvm-exegesis/lib/Analysis.cpp @@ -158,11 +158,13 @@ } Analysis::Analysis(const llvm::Target &Target, + std::unique_ptr SubtargetInfo, std::unique_ptr InstrInfo, const InstructionBenchmarkClustering &Clustering, double AnalysisInconsistencyEpsilon, bool AnalysisDisplayUnstableOpcodes) - : Clustering_(Clustering), InstrInfo_(std::move(InstrInfo)), + : Clustering_(Clustering), SubtargetInfo_(std::move(SubtargetInfo)), + InstrInfo_(std::move(InstrInfo)), AnalysisInconsistencyEpsilonSquared_(AnalysisInconsistencyEpsilon * AnalysisInconsistencyEpsilon), AnalysisDisplayUnstableOpcodes_(AnalysisDisplayUnstableOpcodes) { @@ -172,8 +174,6 @@ const InstructionBenchmark &FirstPoint = Clustering.getPoints().front(); RegInfo_.reset(Target.createMCRegInfo(FirstPoint.LLVMTriple)); AsmInfo_.reset(Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple)); - 
SubtargetInfo_.reset(Target.createMCSubtargetInfo(FirstPoint.LLVMTriple, - FirstPoint.CpuName, "")); InstPrinter_.reset(Target.createMCInstPrinter( llvm::Triple(FirstPoint.LLVMTriple), 0 /*default variant*/, *AsmInfo_, *InstrInfo_, *RegInfo_)); Index: tools/llvm-exegesis/lib/CMakeLists.txt =================================================================== --- tools/llvm-exegesis/lib/CMakeLists.txt +++ tools/llvm-exegesis/lib/CMakeLists.txt @@ -27,6 +27,7 @@ LlvmState.cpp MCInstrDescView.cpp PerfHelper.cpp + PostProcessing.cpp RegisterAliasing.cpp RegisterValue.cpp SchedClassResolution.cpp Index: tools/llvm-exegesis/lib/PostProcessing.h =================================================================== --- /dev/null +++ tools/llvm-exegesis/lib/PostProcessing.h @@ -0,0 +1,33 @@ +//===-- PostProcessing.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Post-processing for the benchmark points. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H +#define LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H + +#include "BenchmarkResult.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { +namespace exegesis { + +void PostProcessBenchmarkPoints( + const llvm::MCSubtargetInfo &SubtargetInfo, + const llvm::MCInstrInfo &InstrInfo, + llvm::MutableArrayRef Points); + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H Index: tools/llvm-exegesis/lib/PostProcessing.cpp =================================================================== --- /dev/null +++ tools/llvm-exegesis/lib/PostProcessing.cpp @@ -0,0 +1,87 @@ +//===-- PostProcessing.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PostProcessing.h" +#include "Clustering.h" +#include "SchedClassResolution.h" +#include "llvm/ADT/STLExtras.h" +#include + +namespace llvm { +namespace exegesis { + +static std::vector +GetSchedDataAsPoint(const llvm::MCSubtargetInfo &SubtargetInfo, + const llvm::MCInstrInfo &InstrInfo, + const llvm::MCInst &Instr, + const InstructionBenchmark &Point, + const SchedClassClusterCentroid &Centroid) { + // 1. Resolve sched class id of the instruction + std::pair ID = + ResolvedSchedClass::resolveSchedClassId(SubtargetInfo, InstrInfo, Instr); + + // 2. Produce ResolvedSchedClass for the resolved sched class id. + ResolvedSchedClass RSC(SubtargetInfo, ID.first, ID.second); + + // 3. Convert ResolvedSchedClass into a 'benchmark point'. + // We need Centroid only for the Keys though. 
+ return RSC.getAsPoint(Point.Mode, SubtargetInfo, Centroid.getStats()); +} + +static void PostProcessPoint(const llvm::MCSubtargetInfo &SubtargetInfo, + const llvm::MCInstrInfo &InstrInfo, + InstructionBenchmark &Point) { + assert(Point.Key.Instructions.size() > 1 && "Should have more than 1 instr."); + + // 1. Produce a centroid out of the measured values. + // We only need it for the Keys and validation though. + SchedClassClusterCentroid Centroid; + Centroid.addPoint(Point.Measurements); + if (!Centroid.validate(Point.Mode)) // Ignore error points. + return; + + // 2. Replace invalid per-instr value with valid per-snippet value. + // The benchmarking code blindly divided per-snippet value by the instr count. + llvm::for_each(Point.Measurements, [](BenchmarkMeasure &Measure) { + Measure.PerInstructionValue = Measure.PerSnippetValue; + }); + + // 3. And finally, subtract the SchedClass-specified values of the extra + // instructions from the measured values, thus leaving only the value + // that actually belongs to the first instruction. + for (const llvm::MCInst &Instr : + ArrayRef(Point.Key.Instructions).drop_front()) { + std::vector Measures = + GetSchedDataAsPoint(SubtargetInfo, InstrInfo, Instr, Point, Centroid); + if (Measures.empty()) // Ignore malformed benchmarks. This won't cause + return; // corruptions because if this fails it will fail the first time. + assert(Point.Measurements.size() == Measures.size() && + "Expected dimensions for measured and computed values to match."); + for (const auto &I : llvm::zip(Point.Measurements, Measures)) + std::get<0>(I).PerInstructionValue -= std::get<1>(I).PerInstructionValue; + } +} + +static bool ShouldPostProcess(InstructionBenchmark &Point) { + // If the benchmark contains more than one instruction, then we will want to + // post-process the measurements to remove the noise from those extra instrs. 
+ return Point.Mode == InstructionBenchmark::ModeE::Latency && + Point.Key.Instructions.size() > 1 && !Point.Measurements.empty(); +} + +void PostProcessBenchmarkPoints( + const llvm::MCSubtargetInfo &SubtargetInfo, + const llvm::MCInstrInfo &InstrInfo, + llvm::MutableArrayRef Points) { + for (InstructionBenchmark &Point : + llvm::make_filter_range(Points, ShouldPostProcess)) + PostProcessPoint(SubtargetInfo, InstrInfo, Point); +} + +} // namespace exegesis +} // namespace llvm Index: tools/llvm-exegesis/llvm-exegesis.cpp =================================================================== --- tools/llvm-exegesis/llvm-exegesis.cpp +++ tools/llvm-exegesis/llvm-exegesis.cpp @@ -17,6 +17,7 @@ #include "lib/Clustering.h" #include "lib/LlvmState.h" #include "lib/PerfHelper.h" +#include "lib/PostProcessing.h" #include "lib/Target.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -449,7 +450,7 @@ llvm::InitializeNativeTargetDisassembler(); // Read benchmarks. const LLVMState State(""); - const std::vector Points = + std::vector Points = ExitOnErr(InstructionBenchmark::readYamls(State, BenchmarkFile)); llvm::outs() << "Parsed " << Points.size() << " benchmark points\n"; if (Points.empty()) { @@ -467,15 +468,20 @@ return; } + std::unique_ptr SubtargetInfo( + TheTarget->createMCSubtargetInfo(Points[0].LLVMTriple, Points[0].CpuName, + "")); std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); + PostProcessBenchmarkPoints(*SubtargetInfo, *InstrInfo, Points); + const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints, AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes())); - const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering, - AnalysisInconsistencyEpsilon, - AnalysisDisplayUnstableOpcodes); + const Analysis Analyzer( + *TheTarget, std::move(SubtargetInfo), std::move(InstrInfo), Clustering, + AnalysisInconsistencyEpsilon, AnalysisDisplayUnstableOpcodes); 
maybeRunAnalysis(Analyzer, "analysis clusters", AnalysisClustersOutputFile);