diff --git a/llvm/include/llvm/Support/LinearAlgebra.h b/llvm/include/llvm/Support/LinearAlgebra.h --- a/llvm/include/llvm/Support/LinearAlgebra.h +++ b/llvm/include/llvm/Support/LinearAlgebra.h @@ -170,11 +170,27 @@ return m; } +/// Are all elements of this matrix, except the ones on the diagonal, all zeros? +template +bool isDiagonalMatrix(MatrixInterface &LHS) { + if (!isSquareMatrix(LHS)) + return false; + for (int row = 0; row != LHS.getNumRows(); ++row) { + for (int col = 0; col != LHS.getNumColumns(); ++col) { + if (col == row) + continue; + if (LHS(row, col) != 0) + return false; + } + } + return true; +} + /// Are all elements of this matrix zeros, except the elements on the main /// diagonal, which are ones? template bool isIdentityMatrix(MatrixInterface &LHS) { - if (!isSquareMatrix(LHS)) + if (!isDiagonalMatrix(LHS)) return false; for (int row = 0; row != LHS.getNumRows(); ++row) { for (int col = 0; col != LHS.getNumColumns(); ++col) { @@ -311,8 +327,7 @@ } T &pivotElement = LHS(pivotRow, column); - if (pivotElement == 0) - continue; + assert(pivotElement != 0); divideRow(LHS, pivotRow, pivotElement); pivotElement = 1.0; // Account for floating point rounding issues. 
@@ -388,16 +403,37 @@ /// (X^T * X)^-1 * X^T * y template -Matrix getOrdinaryLeastSquaresEstimation(MatrixInterface &LHS, - MatrixInterface &RHS) { - assert(LHS.getNumRows() >= LHS.getNumColumns()); - assert(LHS.getNumRows() == RHS.getNumRows()); - assert(RHS.getNumColumns() == 1); +Matrix getOrdinaryLeastSquaresEstimator(MatrixInterface &XMat, + MatrixInterface &YVec) { + assert(XMat.getNumRows() >= XMat.getNumColumns()); + assert(XMat.getNumRows() == YVec.getNumRows()); + assert(YVec.getNumColumns() == 1); - auto LHSNormal = getNormalMatrix(LHS); - auto LHSNormalInverse = getInverseMatrix(LHSNormal); - auto LHSRHSMoment = getMomentMatrix(LHS, RHS); - return LHSNormalInverse * LHSRHSMoment; + auto XMatTransposed = XMat.getTransposedMatrix(); + Matrix XMatNormal = XMatTransposed * XMat; + Matrix XMatNormalInverse = getInverseMatrix(XMatNormal); + Matrix XYMoment = XMatTransposed * YVec; + return XMatNormalInverse * XYMoment; +} + +/// (X^T * W * X)^-1 * X^T * W * y +template +Matrix getWeightedLeastSquaresEstimator(MatrixInterface &XMat, + MatrixInterface &YVec, + MatrixInterface &WMat) { + assert(XMat.getNumRows() >= XMat.getNumColumns()); + assert(isDiagonalMatrix(WMat)); + assert(XMat.getNumRows() == WMat.getNumRows()); + assert(XMat.getNumRows() == YVec.getNumRows()); + assert(YVec.getNumColumns() == 1); + + auto XMatTransposed = XMat.getTransposedMatrix(); + Matrix XMatTransposedWeighted = XMatTransposed * WMat; + Matrix XMatTransposedWeightedNormal = XMatTransposedWeighted * XMat; + Matrix XMatTransposedWeightedNormalInverse = + getInverseMatrix(XMatTransposedWeightedNormal); + Matrix XYMomentWeighted = XMatTransposedWeighted * YVec; + return XMatTransposedWeightedNormalInverse * XYMomentWeighted; } ///--------------------------------------------------------------------------/// diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -160,6 +160,7 @@ for (const 
MCReadAdvanceEntry &E : Entries) { if (E.WriteResourceID != WriteResourceID) continue; + llvm::errs() << "&E = " << &E << ", cycles = " << E.Cycles << "\n"; DelayCycles = std::min(DelayCycles, E.Cycles); } diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -482,7 +482,7 @@ // AMD SOG 19h, 2.11 Floating-Point Unit // There is 1 cycle of added latency for a result to cross // from F to I or I to F domain. -def : ReadAdvance; +def : ReadAdvance; // Instructions with both a load and a store folded are modeled as a folded // load + WriteRMW. diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test @@ -0,0 +1,52 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s + +# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,10.08{{$}} + +# CHECK: {{^}}1, +# CHECK-SAME: ,11.07{{$}} + +# PINSRBrr has latency of 2 cycles. (the value from scheduling profile!) +# But int to fpu units data transfer causes additional latency of 10 cycles. +# Thus the actual latency of VPEXTRBrr is 10..11. 
+ +--- +mode: latency +key: + instructions: + - 'VPEXTRBrr R15D XMM3 i_0x1' + - 'PINSRBrr XMM3 XMM3 R15D i_0x1' + config: '' + register_initial_values: + - 'XMM3=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 0.0000, per_snippet_value: 22.0802 } +error: '' +info: Repeating two instructions +assembled_snippet: 41574883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C410C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01415FC3 +... +--- +mode: latency +key: + instructions: + - 'PEXTRBrr ESI XMM7 i_0x1' + - 'VCVTSI642SSrr XMM7 XMM12 RSI' + config: '' + register_initial_values: + - 'XMM7=0x0' + - 'XMM12=0x0' + - 'RSI=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 12.533, per_snippet_value: 25.066 } +error: '' +info: Repeating two instructions +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C41048BE0000000000000000660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFEC3 +... 
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test @@ -0,0 +1,57 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s + +# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} + +# Instructions were executed serially, meaning that the next instruction +# *ONLY* starts executing when the current instruction finishes. +# Thus, the real latency of the first instruction is the per_snippet_value minus +# the sum of latencies of all the other instructions in the snippet. + +# RCR8rCL has latency of 11. (the value from scheduling profile!) +# Latency of whole snippet is 12 or 23. (not measured, hand-written.) +# Thus, latency of BT32rr is 12-11 = 1, or 23-11-11 = 1 + +--- +mode: latency +key: + instructions: + - 'BT32rr R11D R11D' + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 50.0000, per_snippet_value: 100.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... 
+--- +mode: latency +key: + instructions: + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 25.0000, per_snippet_value: 25.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.h b/llvm/tools/llvm-exegesis/lib/Analysis.h --- a/llvm/tools/llvm-exegesis/lib/Analysis.h +++ b/llvm/tools/llvm-exegesis/lib/Analysis.h @@ -37,6 +37,7 @@ class Analysis { public: Analysis(const Target &Target, std::unique_ptr InstrInfo, + std::unique_ptr SubtargetInfo, const InstructionBenchmarkClustering &Clustering, double AnalysisInconsistencyEpsilon, bool AnalysisDisplayUnstableOpcodes, diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -152,11 +152,13 @@ } Analysis::Analysis(const Target &Target, std::unique_ptr InstrInfo, + std::unique_ptr SubtargetInfo, const InstructionBenchmarkClustering &Clustering, double AnalysisInconsistencyEpsilon, bool AnalysisDisplayUnstableOpcodes, const std::string &ForceCpuName) - : Clustering_(Clustering), InstrInfo_(std::move(InstrInfo)), + : Clustering_(Clustering), SubtargetInfo_(std::move(SubtargetInfo)), + InstrInfo_(std::move(InstrInfo)), AnalysisInconsistencyEpsilonSquared_(AnalysisInconsistencyEpsilon * AnalysisInconsistencyEpsilon), AnalysisDisplayUnstableOpcodes_(AnalysisDisplayUnstableOpcodes) { @@ -170,8 +172,6 @@ MCTargetOptions MCOptions; AsmInfo_.reset( Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple, MCOptions)); - SubtargetInfo_.reset( - Target.createMCSubtargetInfo(FirstPoint.LLVMTriple, CpuName, "")); 
InstPrinter_.reset(Target.createMCInstPrinter( Triple(FirstPoint.LLVMTriple), 0 /*default variant*/, *AsmInfo_, *InstrInfo_, *RegInfo_)); @@ -290,6 +290,8 @@ default: llvm_unreachable("invalid mode"); } + if (Point.Info == "WLS fixpoint" || Point.Info == "WLS reconstruction") + OS << " (" << Point.Info << ")"; OS << " "; writeEscaped(OS, Point.Key.Config); OS << ""; diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -55,6 +55,7 @@ MCInstrDescView.cpp ParallelSnippetGenerator.cpp PerfHelper.cpp + PostProcessing.cpp RegisterAliasing.cpp RegisterValue.cpp SchedClassResolution.cpp diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.h b/llvm/tools/llvm-exegesis/lib/PostProcessing.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.h @@ -0,0 +1,31 @@ +//===-- PostProcessing.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Post-processing for the benchmark points. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H +#define LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H + +#include "BenchmarkResult.h" +#include + +namespace llvm { +namespace exegesis { + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, + const llvm::MCInstrInfo &InstrInfo, + const llvm::MCSubtargetInfo &SubtargetInfo); + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp @@ -0,0 +1,178 @@ +//===-- PostProcessing.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PostProcessing.h" +#include "Clustering.h" +#include "SchedClassResolution.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/LinearAlgebra.h" +#include + +namespace llvm { +namespace exegesis { + +static std::pair +getAdjustedLatency(const InstructionBenchmark *RelevantPoint, + const llvm::MCInstrInfo &InstrInfo, + const llvm::MCSubtargetInfo &SubtargetInfo) { + assert(RelevantPoint->Measurements.size() == 1); + assert(RelevantPoint->Measurements[0].Key == "latency"); + double Latency = RelevantPoint->Measurements[0].PerSnippetValue; + bool HaveForwardingDelays = false; + + if (RelevantPoint->Key.Instructions.size() == 1) + return {Latency, HaveForwardingDelays}; + + const MCSchedModel &SM = SubtargetInfo.getSchedModel(); + for (const MCInst &Inst : RelevantPoint->Key.Instructions) { + const MCInstrDesc 
&MCDesc = InstrInfo.get(Inst.getOpcode()); + + // Obtain the scheduling class information from the instruction. + unsigned SchedClassID = MCDesc.getSchedClass(); + unsigned CPUID = SM.getProcessorID(); + + // Try to solve variant scheduling classes. + while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant()) + SchedClassID = SubtargetInfo.resolveVariantSchedClass(SchedClassID, &Inst, + &InstrInfo, CPUID); + + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); + unsigned ForwardingDelayCycles = MCSchedModel::getForwardingDelayCycles( + SubtargetInfo.getReadAdvanceEntries(SCDesc)); + HaveForwardingDelays |= ForwardingDelayCycles != 0; + } + + return {Latency, HaveForwardingDelays}; +} + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, + const llvm::MCInstrInfo &InstrInfo, + const llvm::MCSubtargetInfo &SubtargetInfo) { + auto IsLatencyPoint = [](const InstructionBenchmark &Point) { + return Point.Mode == InstructionBenchmark::ModeE::Latency && + Point.Error.empty(); + }; + auto IsChainedLatencyPoint = + [IsLatencyPoint](const InstructionBenchmark &Point) { + return IsLatencyPoint(Point) && Point.Key.Instructions.size() >= 2; + }; + + unsigned NumOpcodes = InstrInfo.getNumOpcodes(); + + // Which opcodes were ever chained in all of the benchmark points? + // Only record the ones for which we succeeded in measuring latency. + std::vector OpcodeToIndex(NumOpcodes, /*Index=*/-1); + std::vector IndexToOpcode; + IndexToOpcode.reserve(NumOpcodes); + for (const InstructionBenchmark &Point : + make_filter_range(Points, IsChainedLatencyPoint)) { + for (const MCInst &Instruction : Point.Key.Instructions) { + const unsigned Opcode = Instruction.getOpcode(); + assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); + if (OpcodeToIndex[Opcode] != -1) // Already seen chained? 
+ continue; + OpcodeToIndex[Opcode] = IndexToOpcode.size(); + IndexToOpcode.emplace_back(Instruction); + } + } + + if (IndexToOpcode.empty()) + return; // Lucky us. + + // Remember all the points that contained any opcode that was ever chained. + // We can not do this in the previous loop, because if Opc0 was chained with + // Opc1, we'll miss all standalone points for Opc1 before seeing the chaining. + // We store pointers into Points, so Points must not be modified meanwhile. + SmallVector RelevantPoints; + for (const InstructionBenchmark &Point : + make_filter_range(Points, IsLatencyPoint)) { + if (any_of(Point.Key.Instructions, + [OpcodeToIndex](const MCInst &Instruction) { + return OpcodeToIndex[Instruction.getOpcode()] != -1; + })) + RelevantPoints.emplace_back(&Point); + } + + using namespace linearalgebra; + + errs() << "rows = " << RelevantPoints.size() << "\n"; + errs() << "cols = " << IndexToOpcode.size() << "\n"; + Matrix OpcodeChaining(RelevantPoints.size(), IndexToOpcode.size()); + Matrix ForwardingDelayPresence(RelevantPoints.size(), 1); + Matrix SnippetLatency(RelevantPoints.size(), 1); + Matrix Weights = getIdentityMatrix(RelevantPoints.size()); + + std::vector FixpointOpcodes(NumOpcodes, /*IsFixpoint=*/false); + + for (auto I : enumerate(RelevantPoints)) { + int row = I.index(); + const InstructionBenchmark *RelevantPoint = I.value(); + + // FIXME: this + std::tie(SnippetLatency(row, 0), ForwardingDelayPresence(row, 0)) = + getAdjustedLatency(RelevantPoint, InstrInfo, SubtargetInfo); + + if (RelevantPoint->Key.Instructions.size() == 1) { + // Give a(n arbitrary) bonus to the points that directly measured + // one single specific instruction. We believe these measurements + // to be precise, so the fit should basically not change them. 
+ Weights(row, row) = 1e+3; + FixpointOpcodes[RelevantPoint->Key.Instructions.front().getOpcode()] = + true; + } + + for (const MCInst &Instrn : RelevantPoint->Key.Instructions) { + double &Entry = OpcodeChaining(row, OpcodeToIndex[Instrn.getOpcode()]); + assert(Entry == 0); + Entry = 1; + } + } + RelevantPoints.clear(); + OpcodeToIndex.clear(); + + auto Zz = getAugmentedMatrix(OpcodeChaining, ForwardingDelayPresence); + auto EstimatedInstructionLatencies = + getWeightedLeastSquaresEstimator(Zz, SnippetLatency, Weights); + assert(EstimatedInstructionLatencies.getNumColumns() == 1); + assert((size_t)EstimatedInstructionLatencies.getNumRows() >= + IndexToOpcode.size()); + + // Points.erase( + // std::remove_if(Points.begin(), Points.end(), IsChainedLatencyPoint), + // Points.end()); + Points.clear(); + + Points.reserve(Points.size() + IndexToOpcode.size()); + for (auto I : enumerate(IndexToOpcode)) { + const MCInst &Instruction = I.value(); + const double EstimatedInstructionLatency = + EstimatedInstructionLatencies(I.index(), 0); + + Points.emplace_back(); + InstructionBenchmark &NewPoint = Points.back(); + + NewPoint.Key.Instructions.emplace_back(Instruction); + NewPoint.Mode = InstructionBenchmark::Latency; + NewPoint.CpuName = Points.front().CpuName; + NewPoint.LLVMTriple = Points.front().LLVMTriple; + NewPoint.Measurements.emplace_back( + BenchmarkMeasure::Create("latency", EstimatedInstructionLatency)); + NewPoint.Info = FixpointOpcodes[Instruction.getOpcode()] + ? 
"WLS fixpoint" + : "WLS reconstruction"; + } + for (int I = IndexToOpcode.size(); + I < EstimatedInstructionLatencies.getNumRows(); ++I) + errs() << "Reconstructed forwaring delay = " + << EstimatedInstructionLatencies(I, 0) << "\n"; +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -18,6 +18,7 @@ #include "lib/Error.h" #include "lib/LlvmState.h" #include "lib/PerfHelper.h" +#include "lib/PostProcessing.h" #include "lib/SnippetFile.h" #include "lib/SnippetRepetitor.h" #include "lib/Target.h" @@ -409,7 +410,7 @@ // Read benchmarks. const LLVMState State(""); - const std::vector Points = ExitOnFileError( + std::vector Points = ExitOnFileError( BenchmarkFile, InstructionBenchmark::readYamls(State, BenchmarkFile)); outs() << "Parsed " << Points.size() << " benchmark points\n"; @@ -420,6 +421,7 @@ // FIXME: Check that all points have the same triple/cpu. // FIXME: Merge points from several runs (latency and uops). 
+ std::string Error; const auto *TheTarget = TargetRegistry::lookupTarget(Points[0].LLVMTriple, Error); @@ -431,13 +433,19 @@ std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); assert(InstrInfo && "Unable to create instruction info!"); + std::unique_ptr SubtargetInfo( + TheTarget->createMCSubtargetInfo(Points[0].LLVMTriple, CpuName, "")); + assert(SubtargetInfo && "Unable to create subtarget info!"); + + PostProcessChainedLatencyBenchmarkPoints(Points, *InstrInfo, *SubtargetInfo); + const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints, AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes())); - const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering, - AnalysisInconsistencyEpsilon, - AnalysisDisplayUnstableOpcodes, CpuName); + const Analysis Analyzer( + *TheTarget, std::move(InstrInfo), std::move(SubtargetInfo), Clustering, + AnalysisInconsistencyEpsilon, AnalysisDisplayUnstableOpcodes, CpuName); maybeRunAnalysis(Analyzer, "analysis clusters", AnalysisClustersOutputFile); diff --git a/llvm/unittests/Support/LinearAlgebraTest.cpp b/llvm/unittests/Support/LinearAlgebraTest.cpp --- a/llvm/unittests/Support/LinearAlgebraTest.cpp +++ b/llvm/unittests/Support/LinearAlgebraTest.cpp @@ -175,7 +175,7 @@ y(0, 0) = 5; y(1, 0) = 6; - auto beta = getOrdinaryLeastSquaresEstimation(M, y); + auto beta = getOrdinaryLeastSquaresEstimator(M, y); EXPECT_EQ(std::vector({4, 1. / 2}), getAllValues(beta)); } @@ -199,7 +199,7 @@ y(2, 0) = 7; y(3, 0) = 10; - auto beta = getOrdinaryLeastSquaresEstimation(M, y); + auto beta = getOrdinaryLeastSquaresEstimator(M, y); EXPECT_EQ(std::vector({7. / 2, 7. / 5}), getAllValues(beta)); }