diff --git a/llvm/include/llvm/Support/LinearAlgebra.h b/llvm/include/llvm/Support/LinearAlgebra.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/LinearAlgebra.h
@@ -0,0 +1,444 @@
+//===-- llvm/Support/LinearAlgebra.h - Matrices and operations --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic dense matrix storage, transposed/augmented non-owning views, and the
+// linear algebra needed for (weighted) least-squares fitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_LINEARALGEBRA_H
+#define LLVM_SUPPORT_LINEARALGEBRA_H
+
+// NOTE(review): the header names were lost in transit (bare "#include" lines);
+// these are the headers the code below actually uses (assert, std::abs,
+// std::swap, std::vector) -- TODO confirm against the original patch.
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+namespace linearalgebra {
+
+///--------------------------------------------------------------------------///
+
+template <typename T, typename InnerTy> class TransposedMatrix;
+
+/// CRTP base: the common interface all matrix-like types implement.
+/// \tparam T element type. \tparam Derived the concrete implementation.
+template <typename T, typename Derived> struct MatrixInterface {
+  using value_type = T;
+
+  // MatrixInterface() = delete;
+  MatrixInterface &operator=(MatrixInterface) = delete;
+  MatrixInterface &operator=(const MatrixInterface &) = delete;
+
+  int getNumRows() const { return ((const Derived *)this)->getNumRows(); }
+  int getNumColumns() const { return ((const Derived *)this)->getNumColumns(); }
+
+  T &operator()(int row, int col) {
+    return ((Derived *)this)->operator()(row, col);
+  }
+
+  /// X^T
+  TransposedMatrix<T, Derived> getTransposedMatrix() {
+    return TransposedMatrix<T, Derived>(*((Derived *)this));
+  }
+};
+
+///--------------------------------------------------------------------------///
+
+/// Owning, dense, row-major matrix storage.
+template <typename T>
+class Matrix : public MatrixInterface<T, Matrix<T>> {
+public:
+  Matrix(int num_rows, int num_cols) : num_rows(num_rows), num_cols(num_cols) {
+    // resize() value-initializes, so all elements start out as zero.
+    storage.resize(num_rows * num_cols);
+  }
+
+  // Disallow copying.
+  // Matrix(const Matrix &) = delete;
+  // Matrix(Matrix &&) = delete;
+  // Matrix &operator=(Matrix) = delete;
+  // Matrix &operator=(const Matrix &) = delete;
+  // Matrix &operator=(Matrix &&) = delete;
+
+  int getNumRows() const { return num_rows; };
+  int getNumColumns() const { return num_cols; };
+
+  T &operator()(int row, int col) {
+    assert(row >= 0 && row < num_rows);
+    assert(col >= 0 && col < num_cols);
+
+    return storage[num_cols * row + col];
+  }
+
+private:
+  std::vector<T> storage;
+  int num_rows;
+  int num_cols;
+};
+
+/// Non-owning view that presents the wrapped matrix transposed.
+template <typename T, typename InnerTy>
+class TransposedMatrix
+    : public MatrixInterface<T, TransposedMatrix<T, InnerTy>> {
+  InnerTy &m;
+
+public:
+  TransposedMatrix(InnerTy &m) : m(m) {}
+
+  int getNumRows() const { return m.getNumColumns(); };
+  int getNumColumns() const { return m.getNumRows(); };
+
+  T &operator()(int row, int col) { return m(col, row); }
+};
+
+/// Non-owning view presenting two matrices with equal row counts side-by-side
+/// as the single matrix [A | B].
+template <typename T, typename DerivedA, typename DerivedB>
+class AugmentedMatrix
+    : public MatrixInterface<T, AugmentedMatrix<T, DerivedA, DerivedB>> {
+  MatrixInterface<T, DerivedA> &a;
+  MatrixInterface<T, DerivedB> &b;
+
+public:
+  AugmentedMatrix(MatrixInterface<T, DerivedA> &a,
+                  MatrixInterface<T, DerivedB> &b)
+      : a(a), b(b) {
+    assert(a.getNumRows() == b.getNumRows());
+  }
+
+  int getNumRows() const { return a.getNumRows(); };
+  int getNumColumns() const { return a.getNumColumns() + b.getNumColumns(); };
+
+  T &operator()(int row, int col) {
+    assert(col >= 0 && col < getNumColumns());
+    // Columns [0, a.cols) address A; the remainder address B.
+    if (col < a.getNumColumns())
+      return a(row, col);
+    return b(row, col - a.getNumColumns());
+  }
+};
+
+/// Helper that deduces the template arguments of AugmentedMatrix.
+template <typename T, typename DerivedA, typename DerivedB>
+AugmentedMatrix<T, DerivedA, DerivedB>
+getAugmentedMatrix(MatrixInterface<T, DerivedA> &a,
+                   MatrixInterface<T, DerivedB> &b) {
+  return AugmentedMatrix<T, DerivedA, DerivedB>(a, b);
+}
+
+///--------------------------------------------------------------------------///
+
+/// Deep-copy any matrix-like \p LHS into an owning Matrix.
+template <typename T, typename Derived>
+Matrix<T> cloneMatrix(MatrixInterface<T, Derived> &LHS) {
+  Matrix<T> m(LHS.getNumRows(), LHS.getNumColumns());
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      m(row, col) = LHS(row, col);
+    }
+  }
+  return m;
+}
+
+///--------------------------------------------------------------------------///
+
+/// Get a square 
matrix with all elements being zero except the elements
+/// on the diagonal, which are ones.
+// NOTE(review): the doc comment above appears copy-pasted from
+// getIdentityMatrix(); isSquareMatrix() only tests squareness.
+template <typename T, typename Derived>
+bool isSquareMatrix(const MatrixInterface<T, Derived> &LHS) {
+  return LHS.getNumRows() == LHS.getNumColumns();
+}
+
+/// Are all the entries below the main diagonal zero?
+template <typename T, typename Derived>
+bool isUpperTriangularMatrix(MatrixInterface<T, Derived> &LHS) {
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      // Is this element on the diagonal, or to the right of diagonal?
+      if (col >= row)
+        continue;
+      if (LHS(row, col) != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+/// Get a square matrix with all elements being zero except the elements
+/// on the diagonal, which are ones.
+template <typename T> Matrix<T> getIdentityMatrix(int Size) {
+  Matrix<T> m(Size, Size);
+  for (int diagEltIdx = 0; diagEltIdx != Size; ++diagEltIdx)
+    m(diagEltIdx, diagEltIdx) = 1;
+  return m;
+}
+
+/// Are all elements of this matrix, except the ones on the diagonal, zeros?
+template <typename T, typename Derived>
+bool isDiagonalMatrix(MatrixInterface<T, Derived> &LHS) {
+  if (!isSquareMatrix(LHS))
+    return false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      if (col == row)
+        continue;
+      if (LHS(row, col) != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Are all elements of this matrix zeros, except the elements on the main
+/// diagonal, which are ones?
+template <typename T, typename Derived>
+bool isIdentityMatrix(MatrixInterface<T, Derived> &LHS) {
+  if (!isDiagonalMatrix(LHS))
+    return false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    for (int col = 0; col != LHS.getNumColumns(); ++col) {
+      // BUGFIX: `x != cond ? 1 : 0` parses as `(x != cond) ? 1 : 0` because
+      // `!=` binds tighter than `?:`, so the comparison result was discarded
+      // and the predicate never returned false. Parenthesize the ternary.
+      if (LHS(row, col) != ((col == row) ? 1 : 0))
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+/// On row \p row, which column is the first one to have a non-zero value? 
+/// Returns number of columns if no such element exists.
+template <typename T, typename Derived>
+int getLeadingCoeffientColumn(MatrixInterface<T, Derived> &LHS, int row) {
+  int col = 0;
+  for (; col != LHS.getNumColumns(); ++col) {
+    if (LHS(row, col) != 0)
+      break;
+  }
+  return col;
+}
+
+/// Are all elements of the row \p row zeros?
+// FIX: this is a predicate; it previously declared `int` as its return type.
+template <typename T, typename Derived>
+bool isZeroRow(MatrixInterface<T, Derived> &LHS, int row) {
+  return getLeadingCoeffientColumn(LHS, row) == LHS.getNumColumns();
+}
+
+/// Row echelon form: all-zero rows are at the bottom, and every non-zero
+/// row's leading coefficient lies strictly to the right of the one above.
+template <typename T, typename Derived>
+bool isMatrixInRowEchelonForm(MatrixInterface<T, Derived> &LHS) {
+  assert(isUpperTriangularMatrix(LHS));
+
+  bool seenZeroRow = false;
+  for (int row = 0; row != LHS.getNumRows(); ++row) {
+    bool IsZeroRow = isZeroRow(LHS, row);
+    seenZeroRow |= IsZeroRow;
+    if (seenZeroRow) {
+      // Once a zero row has been seen, every following row must be zero too.
+      if (!IsZeroRow)
+        return false;
+      continue;
+    }
+    if (row == 0)
+      continue;
+    if (getLeadingCoeffientColumn(LHS, row) <=
+        getLeadingCoeffientColumn(LHS, row - 1))
+      return false;
+  }
+  return true;
+}
+
+/// Reduced row echelon form: additionally, every leading coefficient is 1 and
+/// is the only non-zero entry in its column.
+template <typename T, typename Derived>
+bool isMatrixInReducedRowEchelonForm(MatrixInterface<T, Derived> &LHS) {
+  if (!isMatrixInRowEchelonForm(LHS))
+    return false;
+
+  for (int pivot = 0; pivot != LHS.getNumRows(); ++pivot) {
+    if (isZeroRow(LHS, pivot))
+      break;
+    int leadingCoeffientColumn = getLeadingCoeffientColumn(LHS, pivot);
+    for (int row = 0; row != LHS.getNumRows(); ++row) {
+      // BUGFIX: was `!= (row == pivot) ? 1 : 0`; `?:` precedence turned that
+      // into `(LHS(...) != (row == pivot)) ? 1 : 0`, discarding the check.
+      if (LHS(row, leadingCoeffientColumn) != ((row == pivot) ? 1 : 0))
+        return false;
+    }
+  }
+  return true;
+}
+
+///--------------------------------------------------------------------------///
+
+// Elementary row operations
+
+/// Swap rows \p rowA and \p rowB in place.
+template <typename T, typename Derived>
+void swapRows(MatrixInterface<T, Derived> &LHS, int rowA, int rowB) {
+  assert(rowA != rowB);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    std::swap(LHS(rowA, col), LHS(rowB, col));
+}
+
+/// row *= scale (scale must be non-zero).
+template <typename T, typename Derived>
+void multiplyRow(MatrixInterface<T, Derived> &LHS, int row, T scale) {
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(row, col) *= scale;
+}
+
+/// row /= scale (scale must be non-zero).
+template <typename T, typename Derived>
+void divideRow(MatrixInterface<T, Derived> &LHS, int row, T scale) {
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(row, col) /= scale;
+}
+
+/// dstRow += scale * srcRow
+template <typename T, typename Derived>
+void addRowMultiple(MatrixInterface<T, Derived> &LHS, int dstRow, int srcRow,
+                    T scale) {
+  assert(dstRow != srcRow);
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(dstRow, col) += scale * LHS(srcRow, col);
+}
+
+/// dstRow -= scale * srcRow
+template <typename T, typename Derived>
+void subtractRowMultiple(MatrixInterface<T, Derived> &LHS, int dstRow,
+                         int srcRow, T scale) {
+  assert(dstRow != srcRow);
+  assert(scale != 0);
+  for (int col = 0; col != LHS.getNumColumns(); ++col)
+    LHS(dstRow, col) -= scale * LHS(srcRow, col);
+}
+
+///--------------------------------------------------------------------------///
+
+/// Partial pivoting: in \p column, find the row (>= \p firstRow) whose
+/// element has the largest absolute value.
+template <typename T, typename Derived>
+int findPivotingRow(MatrixInterface<T, Derived> &LHS, int column,
+                    int firstRow) {
+  int pivotRow = firstRow;
+  T pivotValMag = std::abs(LHS(firstRow, column));
+  for (int row = firstRow + 1; row != LHS.getNumRows(); ++row) {
+    T currValMag = std::abs(LHS(row, column));
+    if (currValMag < pivotValMag)
+      continue;
+    pivotRow = row;
+    pivotValMag = currValMag;
+  }
+  return pivotRow;
+}
+
+/// Reduce \p LHS to reduced row echelon form in place via Gauss-Jordan
+/// elimination with partial pivoting.
+template <typename T, typename Derived>
+void performGaussJordanElimination(MatrixInterface<T, Derived> &LHS) {
+  assert(LHS.getNumRows() <= LHS.getNumColumns());
+  for (int pivot = 0; pivot != LHS.getNumRows(); ++pivot) {
+    int column = pivot;
+    int pivotRow = findPivotingRow(LHS, column, pivot);
+
+    if (pivotRow != pivot) {
+      swapRows(LHS, pivotRow, 
pivot);
+      pivotRow = pivot;
+    }
+
+    T &pivotElement = LHS(pivotRow, column);
+    assert(pivotElement != 0);
+
+    divideRow(LHS, pivotRow, pivotElement);
+    pivotElement = 1.0; // Account for floating point rounding issues.
+
+    // Eliminate this column from every other row.
+    for (int row = 0; row != LHS.getNumRows(); ++row) {
+      if (row == pivotRow)
+        continue;
+
+      T &currElement = LHS(row, column);
+      if (currElement == 0)
+        continue;
+
+      subtractRowMultiple(LHS, row, pivotRow, currElement);
+      currElement = 0; // Account for floating point rounding issues.
+    }
+  }
+  assert(isMatrixInReducedRowEchelonForm(LHS));
+}
+
+/// X^-1
+template <typename T, typename Derived>
+Matrix<T> getInverseMatrix(MatrixInterface<T, Derived> &LHS) {
+  assert(isSquareMatrix(LHS));
+  // Gauss-Jordan on the augmented [A | I] yields [I | A^-1].
+  Matrix<T> A = cloneMatrix(LHS);
+  Matrix<T> B = getIdentityMatrix<T>(LHS.getNumRows());
+  auto AB = getAugmentedMatrix(A, B);
+  performGaussJordanElimination(AB);
+  assert(isIdentityMatrix(A));
+  return B;
+}
+
+///--------------------------------------------------------------------------///
+
+/// X * Y
+template <typename T, typename DerivedA, typename DerivedB>
+Matrix<T> operator*(MatrixInterface<T, DerivedA> &LHS,
+                    MatrixInterface<T, DerivedB> &RHS) {
+  assert(LHS.getNumColumns() == RHS.getNumRows());
+
+  // R starts out zero-initialized, so += accumulates the dot products.
+  Matrix<T> R(LHS.getNumRows(), RHS.getNumColumns());
+  for (int row = 0; row != R.getNumRows(); ++row) {
+    for (int col = 0; col != R.getNumColumns(); ++col) {
+      for (int k = 0; k != LHS.getNumColumns(); ++k) {
+        R(row, col) += LHS(row, k) * RHS(k, col);
+      }
+    }
+  }
+
+  return R;
+}
+
+///--------------------------------------------------------------------------///
+
+/// X^T * X
+template <typename T, typename Derived>
+Matrix<T> getNormalMatrix(MatrixInterface<T, Derived> &LHS) {
+  auto LHSTransposed = LHS.getTransposedMatrix();
+  return LHSTransposed * LHS;
+}
+
+/// X^T * y
+template <typename T, typename DerivedA, typename DerivedB>
+Matrix<T> getMomentMatrix(MatrixInterface<T, DerivedA> &LHS,
+                          MatrixInterface<T, DerivedB> &RHS) {
+  assert(LHS.getNumRows() == RHS.getNumRows());
+  assert(RHS.getNumColumns() == 1);
+
+  auto LHSTransposed = LHS.getTransposedMatrix();
+  return LHSTransposed * RHS;
+}
+
+///--------------------------------------------------------------------------///
+
+/// (X^T * X)^-1 * X^T * y 
+template <typename T, typename DerivedX, typename DerivedY>
+Matrix<T> getOrdinaryLeastSquaresEstimator(MatrixInterface<T, DerivedX> &XMat,
+                                           MatrixInterface<T, DerivedY> &YVec) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(XMat.getNumRows() == YVec.getNumRows());
+  assert(YVec.getNumColumns() == 1);
+
+  // Solve the normal equations: beta = (X^T X)^-1 X^T y.
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XMatNormal = XMatTransposed * XMat;
+  Matrix<T> XMatNormalInverse = getInverseMatrix(XMatNormal);
+  Matrix<T> XYMoment = XMatTransposed * YVec;
+  return XMatNormalInverse * XYMoment;
+}
+
+/// (X^T * W * X)^-1 * X^T * W * y
+template <typename T, typename DerivedX, typename DerivedY, typename DerivedW>
+Matrix<T> getWeightedLeastSquaresEstimator(MatrixInterface<T, DerivedX> &XMat,
+                                           MatrixInterface<T, DerivedY> &YVec,
+                                           MatrixInterface<T, DerivedW> &WMat) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(isDiagonalMatrix(WMat));
+  assert(XMat.getNumRows() == WMat.getNumRows());
+  assert(XMat.getNumRows() == YVec.getNumRows());
+  assert(YVec.getNumColumns() == 1);
+
+  // Solve the weighted normal equations; W holds per-observation weights.
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XMatTransposedWeighted = XMatTransposed * WMat;
+  Matrix<T> XMatTransposedWeightedNormal = XMatTransposedWeighted * XMat;
+  Matrix<T> XMatTransposedWeightedNormalInverse =
+      getInverseMatrix(XMatTransposedWeightedNormal);
+  Matrix<T> XYMomentWeighted = XMatTransposedWeighted * YVec;
+  return XMatTransposedWeightedNormalInverse * XYMomentWeighted;
+}
+
+///--------------------------------------------------------------------------///
+
+} // namespace linearalgebra
+} // namespace llvm
+
+#endif
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
@@ -0,0 +1,52 @@
+# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s
+
+# 
CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,10.08{{$}} + +# CHECK: {{^}}1, +# CHECK-SAME: ,11.07{{$}} + +# PINSRBrr has latency of 2 cycles. (the value from scheduling profile!) +# But int to fpu units data transfer causes additional latency of 10 cycles. +# Thus the actual latency of VPEXTRBrr is 10..11. + +--- +mode: latency +key: + instructions: + - 'VPEXTRBrr R15D XMM3 i_0x1' + - 'PINSRBrr XMM3 XMM3 R15D i_0x1' + config: '' + register_initial_values: + - 'XMM3=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 0.0000, per_snippet_value: 22.0802 } +error: '' +info: Repeating two instructions +assembled_snippet: 41574883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C410C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01415FC3 +... +--- +mode: latency +key: + instructions: + - 'PEXTRBrr ESI XMM7 i_0x1' + - 'VCVTSI642SSrr XMM7 XMM12 RSI' + config: '' + register_initial_values: + - 'XMM7=0x0' + - 'XMM12=0x0' + - 'RSI=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 12.533, per_snippet_value: 25.066 } +error: '' +info: Repeating two instructions +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C41048BE0000000000000000660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFEC3 +... 
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test @@ -0,0 +1,57 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s + +# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} +# CHECK-NEXT: {{^}}0, +# CHECK-SAME: ,1.00{{$}} + +# Instructions were executed serially, meaning that the next instruction +# *ONLY* starts executing when the current instruction finishes. +# Thus, the real latency of the first instruction is the per_snippet_value minus +# the sum of latencies of all the other instructions in the snippet. + +# RCR8rCL has latency of 11. (the value from scheduling profile!) +# Latency of whole snipped is 12 or 23. (not measured, hand-written.) +# Thus, latency of BT32rr is 12-11 = 1, or 23-11-11 = 1 + +--- +mode: latency +key: + instructions: + - 'BT32rr R11D R11D' + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 50.0000, per_snippet_value: 100.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... 
+--- +mode: latency +key: + instructions: + - 'RCR8rCL R11B R11B' + config: '' + register_initial_values: + - 'R11D=0x0' + - 'R11B=0x0' + - 'CL=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 25.0000, per_snippet_value: 25.0000 } +error: '' +info: Repeating two instructions +assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3 +... diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp --- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp +++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp @@ -289,6 +289,8 @@ default: llvm_unreachable("invalid mode"); } + if (Point.Info == "OLS fit") + OS << " (" << Point.Info << ")"; OS << " "; writeEscaped(OS, Point.Key.Config); OS << ""; diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -55,6 +55,7 @@ MCInstrDescView.cpp ParallelSnippetGenerator.cpp PerfHelper.cpp + PostProcessing.cpp RegisterAliasing.cpp RegisterValue.cpp SchedClassResolution.cpp diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.h b/llvm/tools/llvm-exegesis/lib/PostProcessing.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.h @@ -0,0 +1,29 @@ +//===-- PostProcessing.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Post-processing for the benchmark points. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H +#define LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H + +#include "BenchmarkResult.h" +#include + +namespace llvm { +namespace exegesis { + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, unsigned NumOpcodes); + +} // namespace exegesis +} // namespace llvm + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp @@ -0,0 +1,121 @@ +//===-- PostProcessing.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PostProcessing.h" +#include "Clustering.h" +#include "SchedClassResolution.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/LinearAlgebra.h" +#include + +namespace llvm { +namespace exegesis { + +void PostProcessChainedLatencyBenchmarkPoints( + std::vector &Points, unsigned NumOpcodes) { + auto IsLatencyPoint = [](const InstructionBenchmark &Point) { + return Point.Mode == InstructionBenchmark::ModeE::Latency && + Point.Error.empty(); + }; + auto IsChainedLatencyPoint = + [IsLatencyPoint](const InstructionBenchmark &Point) { + return IsLatencyPoint(Point) && Point.Key.Instructions.size() >= 2; + }; + + // Which opcodes were ever chained in all of the benchmark points? + // Only record the ones for which we succeeded in measuring latency. 
+  std::vector<int> OpcodeToIndex(NumOpcodes, /*Index=*/-1);
+  std::vector<MCInst> IndexToOpcode;
+  IndexToOpcode.reserve(NumOpcodes);
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsChainedLatencyPoint)) {
+    for (const MCInst &Instruction : Point.Key.Instructions) {
+      const unsigned Opcode = Instruction.getOpcode();
+      assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)");
+      if (OpcodeToIndex[Opcode] != -1) // Already seen chained?
+        continue;
+      OpcodeToIndex[Opcode] = IndexToOpcode.size();
+      IndexToOpcode.emplace_back(Instruction);
+    }
+  }
+
+  if (IndexToOpcode.empty())
+    return; // Lucky us.
+
+  // Remember all the points that contained any opcode that was ever chained.
+  // We can not do this in the previous loop, because if Opc0 was chained with
+  // Opc1, we'll miss all standalone points for Opc1 before seeing the chaining.
+  // Pointers into Points stay valid: Points is not modified until after
+  // RelevantPoints has been consumed.
+  SmallVector<const InstructionBenchmark *> RelevantPoints;
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsLatencyPoint)) {
+    // FIX: capture OpcodeToIndex by reference; the original by-value capture
+    // copied the whole vector for every point.
+    if (any_of(Point.Key.Instructions,
+               [&OpcodeToIndex](const MCInst &Instruction) {
+                 return OpcodeToIndex[Instruction.getOpcode()] != -1;
+               }))
+      RelevantPoints.emplace_back(&Point);
+  }
+
+  using namespace linearalgebra;
+
+  // Set up the weighted least-squares problem: each relevant point is one
+  // equation "sum of the latencies of the chained opcodes == measured value".
+  // (FIX: dropped the leftover `errs() << "rows/cols"` debug output.)
+  Matrix<double> OpcodeChaining(RelevantPoints.size(), IndexToOpcode.size());
+  Matrix<double> SnippetLatency(RelevantPoints.size(), 1);
+  Matrix<double> Weights = getIdentityMatrix<double>(RelevantPoints.size());
+
+  for (auto I : enumerate(RelevantPoints)) {
+    int row = I.index();
+    const InstructionBenchmark *RelevantPoint = I.value();
+
+    assert(RelevantPoint->Measurements.size() == 1);
+    assert(RelevantPoint->Measurements[0].Key == "latency");
+    SnippetLatency(row, 0) = RelevantPoint->Measurements[0].PerSnippetValue;
+
+    if (RelevantPoint->Key.Instructions.size() == 1) {
+      // Give a(n arbitrary) bonus to the points that directly measured
+      // one single specific instruction. We believe these measurements
+      // to be precise, so the fit should basically not change them.
+      Weights(row, row) = 1e+3;
+    }
+
+    for (const MCInst &Instruction : RelevantPoint->Key.Instructions)
+      OpcodeChaining(row, OpcodeToIndex[Instruction.getOpcode()]) = 1;
+  }
+  RelevantPoints.clear();
+  OpcodeToIndex.clear();
+
+  auto EstimatedInstructionLatencies = getWeightedLeastSquaresEstimator(
+      OpcodeChaining, SnippetLatency, Weights);
+
+  // FIX: read the shared metadata *before* erasing -- if every point was a
+  // chained one, Points becomes empty and Points.front() would be UB below.
+  const std::string CpuName = Points.front().CpuName;
+  const std::string LLVMTriple = Points.front().LLVMTriple;
+
+  Points.erase(
+      std::remove_if(Points.begin(), Points.end(), IsChainedLatencyPoint),
+      Points.end());
+
+  Points.reserve(Points.size() + IndexToOpcode.size());
+  for (auto I : enumerate(IndexToOpcode)) {
+    const MCInst &Instruction = I.value();
+    const double EstimatedInstructionLatency =
+        EstimatedInstructionLatencies(I.index(), 0);
+
+    Points.emplace_back();
+    InstructionBenchmark &NewPoint = Points.back();
+
+    NewPoint.Key.Instructions.emplace_back(Instruction);
+    NewPoint.Mode = InstructionBenchmark::Latency;
+    NewPoint.CpuName = CpuName;
+    NewPoint.LLVMTriple = LLVMTriple;
+    NewPoint.Measurements.emplace_back(
+        BenchmarkMeasure::Create("latency", EstimatedInstructionLatency));
+    NewPoint.Info = "OLS fit";
+  }
+}
+
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -18,6 +18,7 @@
 #include "lib/Error.h"
 #include "lib/LlvmState.h"
 #include "lib/PerfHelper.h"
+#include "lib/PostProcessing.h"
 #include "lib/SnippetFile.h"
 #include "lib/SnippetRepetitor.h"
 #include "lib/Target.h"
@@ -409,7 +410,7 @@
   // Read benchmarks. 
const LLVMState State(""); - const std::vector Points = ExitOnFileError( + std::vector Points = ExitOnFileError( BenchmarkFile, InstructionBenchmark::readYamls(State, BenchmarkFile)); outs() << "Parsed " << Points.size() << " benchmark points\n"; @@ -420,6 +421,7 @@ // FIXME: Check that all points have the same triple/cpu. // FIXME: Merge points from several runs (latency and uops). + std::string Error; const auto *TheTarget = TargetRegistry::lookupTarget(Points[0].LLVMTriple, Error); @@ -431,6 +433,8 @@ std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); assert(InstrInfo && "Unable to create instruction info!"); + PostProcessChainedLatencyBenchmarkPoints(Points, InstrInfo->getNumOpcodes()); + const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints, AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes())); diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -46,6 +46,7 @@ JSONTest.cpp KnownBitsTest.cpp LEB128Test.cpp + LinearAlgebraTest.cpp LinearPolyBaseTest.cpp LineIteratorTest.cpp LockFileManagerTest.cpp @@ -99,6 +100,8 @@ xxhashTest.cpp ) +set_source_files_properties(LinearAlgebraTest.cpp PROPERTIES COMPILE_FLAGS -O0 -g3 -ggdb3) + target_link_libraries(SupportTests PRIVATE LLVMTestingSupport) # Disable all warning for AlignOfTest.cpp, diff --git a/llvm/unittests/Support/LinearAlgebraTest.cpp b/llvm/unittests/Support/LinearAlgebraTest.cpp new file mode 100644 --- /dev/null +++ b/llvm/unittests/Support/LinearAlgebraTest.cpp @@ -0,0 +1,206 @@ +//===- unittests/Support/LinearAlgebraTest.cpp - linear algebra tests -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/LinearAlgebra.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; +using namespace linearalgebra; + +namespace { + +template +std::vector getAllValues(MatrixInterface &LHS) { + std::vector Elements; + for (int row = 0; row != LHS.getNumRows(); ++row) { + for (int col = 0; col != LHS.getNumColumns(); ++col) + Elements.emplace_back(LHS(row, col)); + } + return Elements; +} + +TEST(LinearAlgebraTest, Basic) { + Matrix M(2, 2); + EXPECT_EQ(std::vector({0, 0, 0, 0}), getAllValues(M)); + + M(0, 1) = 42; + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(M)); + + auto TM = M.getTransposedMatrix(); + EXPECT_EQ(std::vector({0, 0, 42, 0}), getAllValues(TM)); + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(M)); + + auto TTM = TM.getTransposedMatrix(); + EXPECT_EQ(std::vector({0, 42, 0, 0}), getAllValues(TTM)); + + M(1, 1) = 22; + EXPECT_EQ(std::vector({0, 0, 42, 22}), getAllValues(TM)); + EXPECT_EQ(std::vector({0, 42, 0, 22}), getAllValues(M)); + + auto MM = getAugmentedMatrix(M, M); + EXPECT_EQ(std::vector({0, 42, 0, 42, 0, 22, 0, 22}), getAllValues(MM)); + MM(0, 0) = -1; + MM(0, 2) = -1; + + AugmentedMatrix MTM(M, TM); + EXPECT_EQ(std::vector({-1, 42, -1, 0, 0, 22, 42, 22}), + getAllValues(MTM)); + + Matrix M2(2, 1); + M2(0, 0) = 3; + M2(1, 0) = 6; + + auto MM2 = getAugmentedMatrix(M, M2); + EXPECT_EQ(std::vector({-1, 42, 3, 0, 22, 6}), getAllValues(MM2)); + + auto M2M = getAugmentedMatrix(M2, M); + EXPECT_EQ(std::vector({3, -1, 42, 6, 0, 22}), getAllValues(M2M)); +} + +TEST(LinearAlgebraTest, GaussJordanElimination) { + Matrix M(2, 2); + M(0, 0) = 1; + M(0, 1) = 2; + M(1, 0) = 3; + M(1, 1) = 4; + + performGaussJordanElimination(M); + EXPECT_EQ(std::vector({1, 0, 0, 1}), getAllValues(M)); + + Matrix M2(3, 3); + M2(0, 0) = 4; + M2(0, 1) = 1; + M2(0, 2) = 9; + M2(1, 0) = 2; + M2(1, 1) = 
-1; + M2(1, 2) = 3; + M2(2, 0) = 5; + M2(2, 1) = -3; + M2(2, 2) = 7; + + performGaussJordanElimination(M2); + EXPECT_EQ(std::vector({1, 0, 0, 0, 1, 0, 0, 0, 1}), getAllValues(M2)); + + Matrix M3(3, 4); + M3(0, 0) = 2; + M3(0, 1) = 1; + M3(0, 2) = -1; + M3(0, 3) = 8; + + M3(1, 0) = -3; + M3(1, 1) = -1; + M3(1, 2) = 2; + M3(1, 3) = -11; + + M3(2, 0) = -2; + M3(2, 1) = 1; + M3(2, 2) = 2; + M3(2, 3) = -3; + + performGaussJordanElimination(M3); + EXPECT_EQ(std::vector({1, 0, 0, 2, 0, 1, 0, 3, 0, 0, 1, -1}), + getAllValues(M3)); +} + +TEST(LinearAlgebraTest, InverseMatrix) { + Matrix M(3, 3); + M(0, 0) = 2; + M(0, 1) = -1; + M(0, 2) = 0; + + M(1, 0) = -1; + M(1, 1) = 2; + M(1, 2) = -1; + + M(2, 0) = 0; + M(2, 1) = -1; + M(2, 2) = 2; + + auto IM = getInverseMatrix(M); + EXPECT_EQ(std::vector({3. / 4, 1. / 2, 1. / 4, 1. / 2, 1, 1. / 2, + 1. / 4, 1. / 2, 3. / 4}), + getAllValues(IM)); +} + +TEST(LinearAlgebraTest, NormalMatrix) { + Matrix M(3, 2); + M(0, 0) = 0; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 1; + + M(2, 0) = 2; + M(2, 1) = 1; + + auto NM = getNormalMatrix(M); + EXPECT_EQ(std::vector({5, 3, 3, 3}), getAllValues(NM)); +} + +TEST(LinearAlgebraTest, MomentMatrix) { + Matrix M(3, 2); + M(0, 0) = 0; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 1; + + M(2, 0) = 2; + M(2, 1) = 1; + + Matrix y(3, 1); + y(0, 0) = 6; + y(1, 0) = 0; + y(2, 0) = 0; + + auto MM = getMomentMatrix(M, y); + EXPECT_EQ(std::vector({0, 6}), getAllValues(MM)); +} + +TEST(LinearAlgebraTest, OLS) { + Matrix M(2, 2); + M(0, 0) = 1; + M(0, 1) = 2; + + M(1, 0) = 1; + M(1, 1) = 4; + + Matrix y(2, 1); + y(0, 0) = 5; + y(1, 0) = 6; + + auto beta = getOrdinaryLeastSquaresEstimator(M, y); + EXPECT_EQ(std::vector({4, 1. 
/ 2}), getAllValues(beta)); +} + +TEST(LinearAlgebraTest, OLSOverdefined) { + Matrix M(4, 2); + M(0, 0) = 1; + M(0, 1) = 1; + + M(1, 0) = 1; + M(1, 1) = 2; + + M(2, 0) = 1; + M(2, 1) = 3; + + M(3, 0) = 1; + M(3, 1) = 4; + + Matrix y(4, 1); + y(0, 0) = 6; + y(1, 0) = 5; + y(2, 0) = 7; + y(3, 0) = 10; + + auto beta = getOrdinaryLeastSquaresEstimator(M, y); + EXPECT_EQ(std::vector({7. / 2, 7. / 5}), getAllValues(beta)); +} + +} // namespace