diff --git a/llvm/include/llvm/Support/LinearAlgebra.h b/llvm/include/llvm/Support/LinearAlgebra.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/LinearAlgebra.h
@@ -0,0 +1,444 @@
+//===-- llvm/Support/LinearAlgebra.h - Matrices and operations --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic dense matrix storage, transposed/augmented non-owning views, and the
+// linear algebra needed for (weighted) least-squares fitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_LINEARALGEBRA_H
+#define LLVM_SUPPORT_LINEARALGEBRA_H
+
+// NOTE(review): the header names were lost in transit (bare "#include" lines);
+// these are the headers the code below actually uses (assert, std::abs,
+// std::swap, std::vector) -- TODO confirm against the original patch.
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+namespace linearalgebra {
+
+///--------------------------------------------------------------------------///
+
+template <typename T, typename InnerTy> class TransposedMatrix;
+
+/// CRTP base: the common interface all matrix-like types implement.
+/// \tparam T element type. \tparam Derived the concrete implementation.
+template <typename T, typename Derived> struct MatrixInterface {
+  using value_type = T;
+
+  // MatrixInterface() = delete;
+  MatrixInterface &operator=(MatrixInterface) = delete;
+  MatrixInterface &operator=(const MatrixInterface &) = delete;
+
+  int getNumRows() const { return ((const Derived *)this)->getNumRows(); }
+  int getNumColumns() const { return ((const Derived *)this)->getNumColumns(); }
+
+  T &operator()(int row, int col) {
+    return ((Derived *)this)->operator()(row, col);
+  }
+
+  /// X^T
+  TransposedMatrix<T, Derived> getTransposedMatrix() {
+    return TransposedMatrix<T, Derived>(*((Derived *)this));
+  }
+};
+
+///--------------------------------------------------------------------------///
+
+/// Owning, dense, row-major matrix storage.
+template <typename T>
+class Matrix : public MatrixInterface<T, Matrix<T>> {
+public:
+  Matrix(int num_rows, int num_cols) : num_rows(num_rows), num_cols(num_cols) {
+    // resize() value-initializes, so all elements start out as zero.
+    storage.resize(num_rows * num_cols);
+  }
+
+  // Disallow copying.
+  // Matrix(const Matrix &) = delete;
+  // Matrix(Matrix &&) = delete;
+  // Matrix &operator=(Matrix) = delete;
+  // Matrix &operator=(const Matrix &) = delete;
+  // Matrix &operator=(Matrix &&) = delete;
+
+  int getNumRows() const { return num_rows; };
+  int getNumColumns() const { return num_cols; };
+
+  T &operator()(int row, int col) {
+    assert(row >= 0 && row < num_rows);
+    assert(col >= 0 && col < num_cols);
+
+    return storage[num_cols * row + col];
+  }
+
+private:
+  std::vector<T> storage;
+  int num_rows;
+  int num_cols;
+};
+
+/// Non-owning view that presents the wrapped matrix transposed.
+template <typename T, typename InnerTy>
+class TransposedMatrix
+    : public MatrixInterface<T, TransposedMatrix<T, InnerTy>> {
+  InnerTy &m;
+
+public:
+  TransposedMatrix(InnerTy &m) : m(m) {}
+
+  int getNumRows() const { return m.getNumColumns(); };
+  int getNumColumns() const { return m.getNumRows(); };
+
+  T &operator()(int row, int col) { return m(col, row); }
+};
+
+/// Non-owning view presenting two matrices with equal row counts side-by-side
+/// as the single matrix [A | B].
+template <typename T, typename DerivedA, typename DerivedB>
+class AugmentedMatrix
+    : public MatrixInterface<T, AugmentedMatrix<T, DerivedA, DerivedB>> {
+  MatrixInterface<T, DerivedA> &a;
+  MatrixInterface<T, DerivedB> &b;
+
+public:
+  AugmentedMatrix(MatrixInterface<T, DerivedA> &a,
+                  MatrixInterface<T, DerivedB> &b)
+      : a(a), b(b) {
+    assert(a.getNumRows() == b.getNumRows());
+  }
+
+  int getNumRows() const { return a.getNumRows(); };
+  int getNumColumns() const { return a.getNumColumns() + b.getNumColumns(); };
+
+  T &operator()(int row, int col) {
+    assert(col >= 0 && col < getNumColumns());
+    // Columns [0, a.cols) address A; the remainder address B.
+    if (col < a.getNumColumns())
+      return a(row, col);
+    return b(row, col - a.getNumColumns());
+  }
+};
+
+/// Helper that deduces the template arguments of AugmentedMatrix.
+template <typename T, typename DerivedA, typename DerivedB>
+AugmentedMatrix<T, DerivedA, DerivedB>
+getAugmentedMatrix(MatrixInterface<T, DerivedA> &a,
+                   MatrixInterface<T, DerivedB> &b) {
+  return AugmentedMatrix<T, DerivedA, DerivedB>(a, b);
+}
+
+///--------------------------------------------------------------------------///
+
+/// Deep-copy any matrix-like \p LHS into an owning Matrix.
+template <typename T, typename Derived>
+Matrix<T> cloneMatrix(MatrixInterface<T, Derived> &LHS) {
+  Matrix<T> m(LHS.getNumRows(), LHS.getNumColumns());
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      m(row, col) = LHS(row, col);
+    }
+  }
+  return m;
+}
+
+///--------------------------------------------------------------------------///
+
+/// Get a square 
matrix with all elements being zero except the elements
+/// on the diagonal, which are ones.
+// NOTE(review): the doc comment above appears copy-pasted from
+// getIdentityMatrix(); isSquareMatrix() only tests squareness.
+template <typename T, typename Derived>
+bool isSquareMatrix(const MatrixInterface<T, Derived> &LHS) {
+  return LHS.getNumRows() == LHS.getNumColumns();
+}
+
+/// Are all the entries below the main diagonal zero?
+template <typename T, typename Derived>
+bool isUpperTriangularMatrix(MatrixInterface<T, Derived> &LHS) {
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      // Is this element on the diagonal, or to the right of diagonal?
+      if (col >= row)
+        continue;
+      if (LHS(row, col) != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+/// Get a square matrix with all elements being zero except the elements
+/// on the diagonal, which are ones.
+template <typename T> Matrix<T> getIdentityMatrix(int Size) {
+  Matrix<T> m(Size, Size);
+  for (int diagEltIdx = 0; diagEltIdx != Size; ++diagEltIdx)
+    m(diagEltIdx, diagEltIdx) = 1;
+  return m;
+}
+
+/// Are all elements of this matrix, except the ones on the diagonal, zeros?
+template <typename T, typename Derived>
+bool isDiagonalMatrix(MatrixInterface<T, Derived> &LHS) {
+  if (!isSquareMatrix(LHS))
+    return false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      if (col == row)
+        continue;
+      if (LHS(row, col) != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Are all elements of this matrix zeros, except the elements on the main
+/// diagonal, which are ones?
+template <typename T, typename Derived>
+bool isIdentityMatrix(MatrixInterface<T, Derived> &LHS) {
+  if (!isDiagonalMatrix(LHS))
+    return false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      // BUGFIX: `x != cond ? 1 : 0` parses as `(x != cond) ? 1 : 0` because
+      // `!=` binds tighter than `?:`, so the comparison result was discarded
+      // and the predicate never returned false. Parenthesize the ternary.
+      if (LHS(row, col) != ((col == row) ? 1 : 0))
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+/// On row \p row, which column is the first one to have a non-zero value? 
+/// Returns number of columns if no such element exists.
+template <typename T, typename Derived>
+int getLeadingCoeffientColumn(MatrixInterface<T, Derived> &LHS, int row) {
+  int col = 0;
+  for (; col != LHS.getNumColumns(); ++col) {
+    if (LHS(row, col) != 0)
+      break;
+  }
+  return col;
+}
+
+/// Are all elements of the row \p row zeros?
+// FIX: this is a predicate; it previously declared `int` as its return type.
+template <typename T, typename Derived>
+bool isZeroRow(MatrixInterface<T, Derived> &LHS, int row) {
+  return getLeadingCoeffientColumn(LHS, row) == LHS.getNumColumns();
+}
+
+/// Row echelon form: all-zero rows are at the bottom, and every non-zero
+/// row's leading coefficient lies strictly to the right of the one above.
+template <typename T, typename Derived>
+bool isMatrixInRowEchelonForm(MatrixInterface<T, Derived> &LHS) {
+  assert(isUpperTriangularMatrix(LHS));
+
+  bool seenZeroRow = false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    bool IsZeroRow = isZeroRow(LHS, row);
+    seenZeroRow |= IsZeroRow;
+    if (seenZeroRow) {
+      // Once a zero row has been seen, every following row must be zero too.
+      if (!IsZeroRow)
+        return false;
+      continue;
+    }
+    if (row == 0)
+      continue;
+    if (getLeadingCoeffientColumn(LHS, row) <=
+        getLeadingCoeffientColumn(LHS, row - 1))
+      return false;
+  }
+  return true;
+}
+
+/// Reduced row echelon form: additionally, every leading coefficient is 1 and
+/// is the only non-zero entry in its column.
+template <typename T, typename Derived>
+bool isMatrixInReducedRowEchelonForm(MatrixInterface<T, Derived> &LHS) {
+  if (!isMatrixInRowEchelonForm(LHS))
+    return false;
+
+  for (int pivot = 0; pivot != LHS.getNumRows(); ++pivot) {
+    if (isZeroRow(LHS, pivot))
+      break;
+    int leadingCoeffientColumn = getLeadingCoeffientColumn(LHS, pivot);
+    for (int row = 0; row != LHS.getNumRows(); ++row) {
+      // BUGFIX: was `!= (row == pivot) ? 1 : 0`; `?:` precedence turned that
+      // into `(LHS(...) != (row == pivot)) ? 1 : 0`, discarding the check.
+      if (LHS(row, leadingCoeffientColumn) != ((row == pivot) ? 1 : 0))
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+// Elementary row operations
+
+/// Swap rows \p rowA and \p rowB in place.
+template <typename T, typename Derived>
+void swapRows(MatrixInterface<T, Derived> &LHS, int rowA, int rowB) {
+  assert(rowA != rowB);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    std::swap(LHS(rowA, col), LHS(rowB, col));
+}
+
+/// row *= scale (scale must be non-zero).
+template <typename T, typename Derived>
+void multiplyRow(MatrixInterface<T, Derived> &LHS, int row, T scale) {
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(row, col) *= scale;
+}
+
+/// row /= scale (scale must be non-zero).
+template <typename T, typename Derived>
+void divideRow(MatrixInterface<T, Derived> &LHS, int row, T scale) {
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(row, col) /= scale;
+}
+
+/// dstRow += scale * srcRow
+template <typename T, typename Derived>
+void addRowMultiple(MatrixInterface<T, Derived> &LHS, int dstRow, int srcRow,
+                    T scale) {
+  assert(dstRow != srcRow);
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(dstRow, col) += scale * LHS(srcRow, col);
+}
+
+/// dstRow -= scale * srcRow
+template <typename T, typename Derived>
+void subtractRowMultiple(MatrixInterface<T, Derived> &LHS, int dstRow,
+                         int srcRow, T scale) {
+  assert(dstRow != srcRow);
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(dstRow, col) -= scale * LHS(srcRow, col);
+}
+
+///--------------------------------------------------------------------------///
+
+/// Partial pivoting: in \p column, find the row (>= \p firstRow) whose
+/// element has the largest absolute value.
+template <typename T, typename Derived>
+int findPivotingRow(MatrixInterface<T, Derived> &LHS, int column,
+                    int firstRow) {
+  int pivotRow = firstRow;
+  T pivotValMag = std::abs(LHS(firstRow, column));
+  for (int row = firstRow + 1; row != LHS.getNumRows(); ++row) {
+    T currValMag = std::abs(LHS(row, column));
+    if (currValMag < pivotValMag)
+      continue;
+    pivotRow = row;
+    pivotValMag = currValMag;
+  }
+  return pivotRow;
+}
+
+/// Reduce \p LHS to reduced row echelon form in place via Gauss-Jordan
+/// elimination with partial pivoting.
+template <typename T, typename Derived>
+void performGaussJordanElimination(MatrixInterface<T, Derived> &LHS) {
+  assert(LHS.getNumRows() <= LHS.getNumColumns());
+  for (int pivot = 0; pivot != LHS.getNumRows(); ++pivot) {
+    int column = pivot;
+    int pivotRow = findPivotingRow(LHS, column, pivot);
+
+    if (pivotRow != pivot) {
+      swapRows(LHS, pivotRow, 
pivot);
+      pivotRow = pivot;
+    }
+
+    T &pivotElement = LHS(pivotRow, column);
+    assert(pivotElement != 0);
+
+    divideRow(LHS, pivotRow, pivotElement);
+    pivotElement = 1.0; // Account for floating point rounding issues.
+
+    // Eliminate this column from every other row.
+    for (int row = 0; row != LHS.getNumRows(); ++row) {
+      if (row == pivotRow)
+        continue;
+
+      T &currElement = LHS(row, column);
+      if (currElement == 0)
+        continue;
+
+      subtractRowMultiple(LHS, row, pivotRow, currElement);
+      currElement = 0; // Account for floating point rounding issues.
+    }
+  }
+  assert(isMatrixInReducedRowEchelonForm(LHS));
+}
+
+/// X^-1
+template <typename T, typename Derived>
+Matrix<T> getInverseMatrix(MatrixInterface<T, Derived> &LHS) {
+  assert(isSquareMatrix(LHS));
+  // Gauss-Jordan on the augmented [A | I] yields [I | A^-1].
+  Matrix<T> A = cloneMatrix(LHS);
+  Matrix<T> B = getIdentityMatrix<T>(LHS.getNumRows());
+  auto AB = getAugmentedMatrix(A, B);
+  performGaussJordanElimination(AB);
+  assert(isIdentityMatrix(A));
+  return B;
+}
+
+///--------------------------------------------------------------------------///
+
+/// X * Y
+template <typename T, typename DerivedA, typename DerivedB>
+Matrix<T> operator*(MatrixInterface<T, DerivedA> &LHS,
+                    MatrixInterface<T, DerivedB> &RHS) {
+  assert(LHS.getNumColumns() == RHS.getNumRows());
+
+  // R starts out zero-initialized, so += accumulates the dot products.
+  Matrix<T> R(LHS.getNumRows(), RHS.getNumColumns());
+  for (int row = 0; row != R.getNumRows(); ++row) {
+    for (int col = 0; col != R.getNumColumns(); ++col) {
+      for (int k = 0; k != LHS.getNumColumns(); ++k) {
+        R(row, col) += LHS(row, k) * RHS(k, col);
+      }
+    }
+  }
+
+  return R;
+}
+
+///--------------------------------------------------------------------------///
+
+/// X^T * X
+template <typename T, typename Derived>
+Matrix<T> getNormalMatrix(MatrixInterface<T, Derived> &LHS) {
+  auto LHSTransposed = LHS.getTransposedMatrix();
+  return LHSTransposed * LHS;
+}
+
+/// X^T * y
+template <typename T, typename DerivedA, typename DerivedB>
+Matrix<T> getMomentMatrix(MatrixInterface<T, DerivedA> &LHS,
+                          MatrixInterface<T, DerivedB> &RHS) {
+  assert(LHS.getNumRows() == RHS.getNumRows());
+  assert(RHS.getNumColumns() == 1);
+
+  auto LHSTransposed = LHS.getTransposedMatrix();
+  return LHSTransposed * RHS;
+}
+
+///--------------------------------------------------------------------------///
+
+/// (X^T * X)^-1 * X^T * y 
+template <typename T, typename DerivedX, typename DerivedY>
+Matrix<T> getOrdinaryLeastSquaresEstimator(MatrixInterface<T, DerivedX> &XMat,
+                                           MatrixInterface<T, DerivedY> &YVec) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(XMat.getNumRows() == YVec.getNumRows());
+  assert(YVec.getNumColumns() == 1);
+
+  // Solve the normal equations: beta = (X^T X)^-1 X^T y.
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XMatNormal = XMatTransposed * XMat;
+  Matrix<T> XMatNormalInverse = getInverseMatrix(XMatNormal);
+  Matrix<T> XYMoment = XMatTransposed * YVec;
+  return XMatNormalInverse * XYMoment;
+}
+
+/// (X^T * W * X)^-1 * X^T * W * y
+template <typename T, typename DerivedX, typename DerivedY, typename DerivedW>
+Matrix<T> getWeightedLeastSquaresEstimator(MatrixInterface<T, DerivedX> &XMat,
+                                           MatrixInterface<T, DerivedY> &YVec,
+                                           MatrixInterface<T, DerivedW> &WMat) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(isDiagonalMatrix(WMat));
+  assert(XMat.getNumRows() == WMat.getNumRows());
+  assert(XMat.getNumRows() == YVec.getNumRows());
+  assert(YVec.getNumColumns() == 1);
+
+  // Solve the weighted normal equations; W holds per-observation weights.
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XMatTransposedWeighted = XMatTransposed * WMat;
+  Matrix<T> XMatTransposedWeightedNormal = XMatTransposedWeighted * XMat;
+  Matrix<T> XMatTransposedWeightedNormalInverse =
+      getInverseMatrix(XMatTransposedWeightedNormal);
+  Matrix<T> XYMomentWeighted = XMatTransposedWeighted * YVec;
+  return XMatTransposedWeightedNormalInverse * XYMomentWeighted;
+}
+
+///--------------------------------------------------------------------------///
+
+} // namespace linearalgebra
+} // namespace llvm
+
+#endif
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
@@ -0,0 +1,52 @@
+# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s
+
+# 
CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,10.08{{$}} + +# CHECK: {{^}}1, +# CHECK-SAME: ,11.07{{$}} + +# PINSRBrr has latency of 2 cycles. (the value from scheduling profile!) +# But int to fpu units data transfer causes additional latency of 10 cycles. +# Thus the actual latency of VPEXTRBrr is 10..11. + +--- +mode: latency +key: + instructions: + - 'VPEXTRBrr R15D XMM3 i_0x1' + - 'PINSRBrr XMM3 XMM3 R15D i_0x1' + config: '' + register_initial_values: + - 'XMM3=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 0.0000, per_snippet_value: 22.0802 } +error: '' +info: Repeating two instructions +assembled_snippet: 41574883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C410C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01415FC3 +... +--- +mode: latency +key: + instructions: + - 'PEXTRBrr ESI XMM7 i_0x1' + - 'VCVTSI642SSrr XMM7 XMM12 RSI' + config: '' + register_initial_values: + - 'XMM7=0x0' + - 'XMM12=0x0' + - 'RSI=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 12.533, per_snippet_value: 25.066 } +error: '' +info: Repeating two instructions +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C41048BE0000000000000000660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFEC3 +... 
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test @@ -0,0 +1,57 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s + +# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} + +# Instructions were executed serially, meaning that the next instruction +# *ONLY* starts executing when the current instruction finishes. +# Thus, the real latency of the first instruction is the per_snippet_value minus +# the sum of latencies of all the other instructions in the snippet. + +# RCR8rCL has latency of 11. (the value from scheduling profile!) +# Latency of whole snipped is 12 or 23. (not measured, hand-written.) +# Thus, latency of BT32rr is 12-11 = 1, or 23-11-11 = 1 + +--- +mode: latency +key: + instructions: + - 'BT32rr R11D R11D' + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 50.0000, per_snippet_value: 100.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... 
+--- +mode: latency +key: + instructions: + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 25.0000, per_snippet_value: 25.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -289,6 +289,8 @@ default: llvm_unreachable("invalid mode"); } + if (Point.Info == "OLS fit") + OS << " (" << Point.Info << ")"; OS << " "; writeEscaped(OS, Point.Key.Config); OS << ""; diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -55,6 +55,7 @@ MCInstrDescView.cpp ParallelSnippetGenerator.cpp PerfHelper.cpp + PostProcessing.cpp RegisterAliasing.cpp RegisterValue.cpp SchedClassResolution.cpp diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.h b/llvm/tools/llvm-exegesis/lib/PostProcessing.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.h @@ -0,0 +1,29 @@ +//===-- PostProcessing.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Post-processing for the benchmark points. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H +#define LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H + +#include "BenchmarkResult.h" +#include + +namespace llvm { +namespace exegesis { + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, unsigned NumOpcodes); + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp @@ -0,0 +1,121 @@ +//===-- PostProcessing.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PostProcessing.h" +#include "Clustering.h" +#include "SchedClassResolution.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/LinearAlgebra.h" +#include + +namespace llvm { +namespace exegesis { + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, unsigned NumOpcodes) { + auto IsLatencyPoint = [](const InstructionBenchmark &Point) { + return Point.Mode == InstructionBenchmark::ModeE::Latency && + Point.Error.empty(); + }; + auto IsChainedLatencyPoint = + [IsLatencyPoint](const InstructionBenchmark &Point) { + return IsLatencyPoint(Point) && Point.Key.Instructions.size() >= 2; + }; + + // Which opcodes were ever chained in all of the benchmark points? + // Only record the ones for which we succeeded in measuring latency. 
+  std::vector<int> OpcodeToIndex(NumOpcodes, /*Index=*/-1);
+  std::vector<MCInst> IndexToOpcode;
+  IndexToOpcode.reserve(NumOpcodes);
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsChainedLatencyPoint)) {
+    for (const MCInst &Instruction : Point.Key.Instructions) {
+      const unsigned Opcode = Instruction.getOpcode();
+      assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)");
+      if (OpcodeToIndex[Opcode] != -1) // Already seen chained?
+        continue;
+      OpcodeToIndex[Opcode] = IndexToOpcode.size();
+      IndexToOpcode.emplace_back(Instruction);
+    }
+  }
+
+  if (IndexToOpcode.empty())
+    return; // Lucky us.
+
+  // Remember all the points that contained any opcode that was ever chained.
+  // We can not do this in the previous loop, because if Opc0 was chained with
+  // Opc1, we'll miss all standalone points for Opc1 before seeing the chaining.
+  // Pointers into Points stay valid: Points is not modified until after
+  // RelevantPoints has been consumed.
+  SmallVector<const InstructionBenchmark *> RelevantPoints;
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsLatencyPoint)) {
+    // FIX: capture OpcodeToIndex by reference; the original by-value capture
+    // copied the whole vector for every point.
+    if (any_of(Point.Key.Instructions,
+               [&OpcodeToIndex](const MCInst &Instruction) {
+                 return OpcodeToIndex[Instruction.getOpcode()] != -1;
+               }))
+      RelevantPoints.emplace_back(&Point);
+  }
+
+  using namespace linearalgebra;
+
+  // Set up the weighted least-squares problem: each relevant point is one
+  // equation "sum of the latencies of the chained opcodes == measured value".
+  // (FIX: dropped the leftover `errs() << "rows/cols"` debug output.)
+  Matrix<double> OpcodeChaining(RelevantPoints.size(), IndexToOpcode.size());
+  Matrix<double> SnippetLatency(RelevantPoints.size(), 1);
+  Matrix<double> Weights = getIdentityMatrix<double>(RelevantPoints.size());
+
+  for (auto I : enumerate(RelevantPoints)) {
+    int row = I.index();
+    const InstructionBenchmark *RelevantPoint = I.value();
+
+    assert(RelevantPoint->Measurements.size() == 1);
+    assert(RelevantPoint->Measurements[0].Key == "latency");
+    SnippetLatency(row, 0) = RelevantPoint->Measurements[0].PerSnippetValue;
+
+    if (RelevantPoint->Key.Instructions.size() == 1) {
+      // Give a(n arbitrary) bonus to the points that directly measured
+      // one single specific instruction. We believe these measurements
+      // to be precise, so the fit should basically not change them.
+      Weights(row, row) = 1e+3;
+    }
+
+    for (const MCInst &Instruction : RelevantPoint->Key.Instructions)
+      OpcodeChaining(row, OpcodeToIndex[Instruction.getOpcode()]) = 1;
+  }
+  RelevantPoints.clear();
+  OpcodeToIndex.clear();
+
+  auto EstimatedInstructionLatencies = getWeightedLeastSquaresEstimator(
+      OpcodeChaining, SnippetLatency, Weights);
+
+  // FIX: read the shared metadata *before* erasing -- if every point was a
+  // chained one, Points becomes empty and Points.front() would be UB below.
+  const std::string CpuName = Points.front().CpuName;
+  const std::string LLVMTriple = Points.front().LLVMTriple;
+
+  Points.erase(
+      std::remove_if(Points.begin(), Points.end(), IsChainedLatencyPoint),
+      Points.end());
+
+  Points.reserve(Points.size() + IndexToOpcode.size());
+  for (auto I : enumerate(IndexToOpcode)) {
+    const MCInst &Instruction = I.value();
+    const double EstimatedInstructionLatency =
+        EstimatedInstructionLatencies(I.index(), 0);
+
+    Points.emplace_back();
+    InstructionBenchmark &NewPoint = Points.back();
+
+    NewPoint.Key.Instructions.emplace_back(Instruction);
+    NewPoint.Mode = InstructionBenchmark::Latency;
+    NewPoint.CpuName = CpuName;
+    NewPoint.LLVMTriple = LLVMTriple;
+    NewPoint.Measurements.emplace_back(
+        BenchmarkMeasure::Create("latency", EstimatedInstructionLatency));
+    NewPoint.Info = "OLS fit";
+  }
+}
+
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -18,6 +18,7 @@
 #include "lib/Error.h"
 #include "lib/LlvmState.h"
 #include "lib/PerfHelper.h"
+#include "lib/PostProcessing.h"
 #include "lib/SnippetFile.h"
 #include "lib/SnippetRepetitor.h"
 #include "lib/Target.h"
@@ -409,7 +410,7 @@
   // Read benchmarks. 
const LLVMState State(""); - const std::vector Points = ExitOnFileError( + std::vector Points = ExitOnFileError( BenchmarkFile, InstructionBenchmark::readYamls(State, BenchmarkFile)); outs() << "Parsed " << Points.size() << " benchmark points\n"; @@ -420,6 +421,7 @@ // FIXME: Check that all points have the same triple/cpu. // FIXME: Merge points from several runs (latency and uops). + std::string Error; const auto *TheTarget = TargetRegistry::lookupTarget(Points[0].LLVMTriple, Error); @@ -431,6 +433,8 @@ std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); assert(InstrInfo && "Unable to create instruction info!"); + PostProcessChainedLatencyBenchmarkPoints(Points, InstrInfo->getNumOpcodes()); + const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints, AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes())); diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -46,6 +46,7 @@ JSONTest.cpp KnownBitsTest.cpp LEB128Test.cpp + LinearAlgebraTest.cpp LinearPolyBaseTest.cpp LineIteratorTest.cpp LockFileManagerTest.cpp @@ -99,6 +100,8 @@ xxhashTest.cpp ) +set_source_files_properties(LinearAlgebraTest.cpp PROPERTIES COMPILE_FLAGS -O0 -g3 -ggdb3) + target_link_libraries(SupportTests PRIVATE LLVMTestingSupport) # Disable all warning for AlignOfTest.cpp, diff --git a/llvm/unittests/Support/LinearAlgebraTest.cpp b/llvm/unittests/Support/LinearAlgebraTest.cpp new file mode 100644 --- /dev/null +++ b/llvm/unittests/Support/LinearAlgebraTest.cpp @@ -0,0 +1,206 @@ +//===- unittests/Support/LinearAlgebraTest.cpp - linear algebra tests -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/LinearAlgebra.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; +using namespace linearalgebra; + +namespace { + +template +std::vector getAllValues(MatrixInterface &LHS) { + std::vector Elements; + for (int row = 0; row != LHS.getNumRows(); ++row) { + for (int col = 0; col != LHS.getNumColumns(); ++col) + Elements.emplace_back(LHS(row, col)); + } + return Elements; +} + +TEST(LinearAlgebraTest, Basic) { + Matrix M(2, 2); + EXPECT_EQ(std::vector({0, 0, 0, 0}), getAllValues(M)); + + M(0, 1) = 42; + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(M)); + + auto TM = M.getTransposedMatrix(); + EXPECT_EQ(std::vector({0, 0, 42, 0}), getAllValues(TM)); + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(M)); + + auto TTM = TM.getTransposedMatrix(); + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(TTM)); + + M(1, 1) = 22; + EXPECT_EQ(std::vector({0, 0, 42, 22}), getAllValues(TM)); + EXPECT_EQ(std::vector({0, 42, 0, 22}), getAllValues(M)); + + auto MM = getAugmentedMatrix(M, M); + EXPECT_EQ(std::vector({0, 42, 0, 42, 0, 22, 0, 22}), getAllValues(MM)); + MM(0, 0) = -1; + MM(0, 2) = -1; + + AugmentedMatrix MTM(M, TM); + EXPECT_EQ(std::vector({-1, 42, -1, 0, 0, 22, 42, 22}), + getAllValues(MTM)); + + Matrix M2(2, 1); + M2(0, 0) = 3; + M2(1, 0) = 6; + + auto MM2 = getAugmentedMatrix(M, M2); + EXPECT_EQ(std::vector({-1, 42, 3, 0, 22, 6}), getAllValues(MM2)); + + auto M2M = getAugmentedMatrix(M2, M); + EXPECT_EQ(std::vector({3, -1, 42, 6, 0, 22}), getAllValues(M2M)); +} + +TEST(LinearAlgebraTest, GaussJordanElimination) { + Matrix M(2, 2); + M(0, 0) = 1; + M(0, 1) = 2; + M(1, 0) = 3; + M(1, 1) = 4; + + performGaussJordanElimination(M); + EXPECT_EQ(std::vector({1, 0, 0, 1}), getAllValues(M)); + + Matrix M2(3, 3); + M2(0, 0) = 4; + M2(0, 1) = 1; + M2(0, 2) = 9; + M2(1, 0) = 2; + M2(1, 1) = 
-1; + M2(1, 2) = 3; + M2(2, 0) = 5; + M2(2, 1) = -3; + M2(2, 2) = 7; + + performGaussJordanElimination(M2); + EXPECT_EQ(std::vector({1, 0, 0, 0, 1, 0, 0, 0, 1}), getAllValues(M2)); + + Matrix M3(3, 4); + M3(0, 0) = 2; + M3(0, 1) = 1; + M3(0, 2) = -1; + M3(0, 3) = 8; + + M3(1, 0) = -3; + M3(1, 1) = -1; + M3(1, 2) = 2; + M3(1, 3) = -11; + + M3(2, 0) = -2; + M3(2, 1) = 1; + M3(2, 2) = 2; + M3(2, 3) = -3; + + performGaussJordanElimination(M3); + EXPECT_EQ(std::vector({1, 0, 0, 2, 0, 1, 0, 3, 0, 0, 1, -1}), + getAllValues(M3)); +} + +TEST(LinearAlgebraTest, InverseMatrix) { + Matrix M(3, 3); + M(0, 0) = 2; + M(0, 1) = -1; + M(0, 2) = 0; + + M(1, 0) = -1; + M(1, 1) = 2; + M(1, 2) = -1; + + M(2, 0) = 0; + M(2, 1) = -1; + M(2, 2) = 2; + + auto IM = getInverseMatrix(M); + EXPECT_EQ(std::vector({3. / 4, 1. / 2, 1. / 4, 1. / 2, 1, 1. / 2, + 1. / 4, 1. / 2, 3. / 4}), + getAllValues(IM)); +} + +TEST(LinearAlgebraTest, NormalMatrix) { + Matrix M(3, 2); + M(0, 0) = 0; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 1; + + M(2, 0) = 2; + M(2, 1) = 1; + + auto NM = getNormalMatrix(M); + EXPECT_EQ(std::vector({5, 3, 3, 3}), getAllValues(NM)); +} + +TEST(LinearAlgebraTest, MomentMatrix) { + Matrix M(3, 2); + M(0, 0) = 0; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 1; + + M(2, 0) = 2; + M(2, 1) = 1; + + Matrix y(3, 1); + y(0, 0) = 6; + y(1, 0) = 0; + y(2, 0) = 0; + + auto MM = getMomentMatrix(M, y); + EXPECT_EQ(std::vector({0, 6}), getAllValues(MM)); +} + +TEST(LinearAlgebraTest, OLS) { + Matrix M(2, 2); + M(0, 0) = 1; + M(0, 1) = 2; + + M(1, 0) = 1; + M(1, 1) = 4; + + Matrix y(2, 1); + y(0, 0) = 5; + y(1, 0) = 6; + + auto beta = getOrdinaryLeastSquaresEstimator(M, y); + EXPECT_EQ(std::vector({4, 1. 
/ 2}), getAllValues(beta)); +} + +TEST(LinearAlgebraTest, OLSOverdefined) { + Matrix M(4, 2); + M(0, 0) = 1; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 2; + + M(2, 0) = 1; + M(2, 1) = 3; + + M(3, 0) = 1; + M(3, 1) = 4; + + Matrix y(4, 1); + y(0, 0) = 6; + y(1, 0) = 5; + y(2, 0) = 7; + y(3, 0) = 10; + + auto beta = getOrdinaryLeastSquaresEstimator(M, y); + EXPECT_EQ(std::vector({7. / 2, 7. / 5}), getAllValues(beta)); +} + +} // namespace