diff --git a/llvm/include/llvm/Support/LinearAlgebra.h b/llvm/include/llvm/Support/LinearAlgebra.h
--- a/llvm/include/llvm/Support/LinearAlgebra.h
+++ b/llvm/include/llvm/Support/LinearAlgebra.h
@@ -170,11 +170,27 @@
   return m;
 }
 
+/// Returns true iff \p LHS is a square matrix whose off-diagonal elements are
+/// all zeros. The elements on the main diagonal itself may have any value.
+template <typename T, typename LHSTy>
+bool isDiagonalMatrix(MatrixInterface<T, LHSTy> &LHS) {
+  if (!isSquareMatrix(LHS))
+    return false;
+  for (int R = 0; R != LHS.getNumRows(); ++R) {
+    for (int C = 0; C != LHS.getNumColumns(); ++C) {
+      // Only cells off the main diagonal can disqualify the matrix.
+      if (R != C && LHS(R, C) != 0)
+        return false;
+    }
+  }
+  return true;
+}
+
 /// Are all elements of this matrix zeros, except the elements on the main
 /// diagonal, which are ones?
 template <typename T, typename LHSTy>
 bool isIdentityMatrix(MatrixInterface<T, LHSTy> &LHS) {
-  if (!isSquareMatrix(LHS))
+  if (!isDiagonalMatrix(LHS))
     return false;
   for (int row = 0; row != LHS.getNumRows(); ++row) {
     for (int col = 0; col != LHS.getNumColumns(); ++col) {
@@ -311,8 +327,7 @@
     }
 
     T &pivotElement = LHS(pivotRow, column);
-    if (pivotElement == 0)
-      continue;
+    assert(pivotElement != 0);
 
     divideRow(LHS, pivotRow, pivotElement);
     pivotElement = 1.0; // Account for floating point rounding issues.
@@ -388,16 +403,37 @@
 
 /// (X^T * X)^-1 * X^T * y
 template <typename T, typename LHSTy, typename RHSTy>
-Matrix<T> getOrdinaryLeastSquaresEstimation(MatrixInterface<T, LHSTy> &LHS,
-                                            MatrixInterface<T, RHSTy> &RHS) {
-  assert(LHS.getNumRows() >= LHS.getNumColumns());
-  assert(LHS.getNumRows() == RHS.getNumRows());
-  assert(RHS.getNumColumns() == 1);
+Matrix<T> getOrdinaryLeastSquaresEstimator(MatrixInterface<T, LHSTy> &XMat,
+                                           MatrixInterface<T, RHSTy> &YVec) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(XMat.getNumRows() == YVec.getNumRows()); // One y entry per X row.
+  assert(YVec.getNumColumns() == 1); // y must be a column vector.
 
-  auto LHSNormal = getNormalMatrix<T>(LHS);
-  auto LHSNormalInverse = getInverseMatrix<T>(LHSNormal);
-  auto LHSRHSMoment = getMomentMatrix<T>(LHS, RHS);
-  return LHSNormalInverse * LHSRHSMoment;
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XMatNormal = XMatTransposed * XMat; // X^T * X
+  Matrix<T> XMatNormalInverse = getInverseMatrix<T>(XMatNormal);
+  Matrix<T> XYMoment = XMatTransposed * YVec; // X^T * y
+  return XMatNormalInverse * XYMoment;
+}
+
+/// Weighted least squares: returns (X^T * W * X)^-1 * X^T * W * y.
+/// \p WMat must be diagonal; its matrix type is independent of \p YVec's.
+template <typename T, typename LHSTy, typename RHSTy, typename WTy>
+Matrix<T> getWeightedLeastSquaresEstimator(MatrixInterface<T, LHSTy> &XMat,
+                                           MatrixInterface<T, RHSTy> &YVec,
+                                           MatrixInterface<T, WTy> &WMat) {
+  assert(XMat.getNumRows() >= XMat.getNumColumns());
+  assert(isDiagonalMatrix(WMat));
+  assert(XMat.getNumRows() == WMat.getNumRows());
+  assert(XMat.getNumRows() == YVec.getNumRows());
+  assert(YVec.getNumColumns() == 1);
+
+  auto XMatTransposed = XMat.getTransposedMatrix();
+  Matrix<T> XtW = XMatTransposed * WMat;             // X^T * W
+  Matrix<T> XtWX = XtW * XMat;                       // X^T * W * X
+  Matrix<T> XtWXInverse = getInverseMatrix<T>(XtWX); // (X^T * W * X)^-1
+  Matrix<T> XtWy = XtW * YVec;                       // X^T * W * y
+  return XtWXInverse * XtWy;
 }
 
 ///--------------------------------------------------------------------------///
diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
--- a/llvm/lib/MC/MCSchedule.cpp
+++ b/llvm/lib/MC/MCSchedule.cpp
@@ -160,6 +160,7 @@
   for (const MCReadAdvanceEntry &E : Entries) {
     if (E.WriteResourceID != WriteResourceID)
       continue;
+    llvm::errs() << "&E = " << &E << ", cycles = " << E.Cycles << "\n";
     DelayCycles = std::min(DelayCycles, E.Cycles);
   }
 
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -482,7 +482,7 @@
 // AMD SOG 19h, 2.11 Floating-Point Unit
 // There is 1 cycle of added latency for a result to cross
 // from F to I or I to F domain.
-def : ReadAdvance<ReadInt2Fpu, -1>;
+def : ReadAdvance<ReadInt2Fpu, -1>; // Negative advance: 1 extra cycle.
 
 // Instructions with both a load and a store folded are modeled as a folded
 // load + WriteRMW.
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining-domain-transfer.test
@@ -0,0 +1,52 @@
+# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s
+
+# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}}
+
+# CHECK-NEXT: {{^}}0,
+# CHECK-SAME: ,10.08{{$}}
+
+# CHECK: {{^}}1,
+# CHECK-SAME: ,11.07{{$}}
+
+# PINSRBrr has latency of 2 cycles. (the value from scheduling profile!)
+# But int to fpu units data transfer causes additional latency of 10 cycles.
+# Thus the actual latency of VPEXTRBrr is 10..11.
+
+---
+mode:            latency
+key:
+  instructions:
+    - 'VPEXTRBrr R15D XMM3 i_0x1'
+    - 'PINSRBrr XMM3 XMM3 R15D i_0x1'
+  config:          ''
+  register_initial_values:
+    - 'XMM3=0x0'
+cpu_name:        bdver2
+llvm_triple:     x86_64-unknown-linux-gnu
+num_repetitions: 10000
+measurements:
+  - { key: latency, value: 0.0000, per_snippet_value: 22.0802 }
+error:           ''
+info:            Repeating two instructions
+assembled_snippet: 41574883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C410C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01C4C37914DF0166410F3A20DF01415FC3
+...
+---
+mode:            latency
+key:
+  instructions:
+    - 'PEXTRBrr ESI XMM7 i_0x1'
+    - 'VCVTSI642SSrr XMM7 XMM12 RSI'
+  config:          ''
+  register_initial_values:
+    - 'XMM7=0x0'
+    - 'XMM12=0x0'
+    - 'RSI=0x0'
+cpu_name:        bdver2
+llvm_triple:     x86_64-unknown-linux-gnu
+num_repetitions: 10000
+measurements:
+  - { key: latency, value: 12.533, per_snippet_value: 25.066 }
+error:           ''
+info:            Repeating two instructions
+assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C41048BE0000000000000000660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFE660F3A14FE01C4E19A2AFEC3
+...
diff --git a/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/analysis-latency-instruction-chaining.test
@@ -0,0 +1,57 @@
+# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK %s
+
+# CHECK: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}}
+
+# CHECK-NEXT: {{^}}0,
+# CHECK-SAME: ,1.00{{$}}
+# CHECK-NEXT: {{^}}0,
+# CHECK-SAME: ,1.00{{$}}
+
+# Instructions were executed serially, meaning that the next instruction
+# *ONLY* starts executing when the current instruction finishes.
+# Thus, the real latency of the first instruction is the per_snippet_value minus
+# the sum of latencies of all the other instructions in the snippet.
+
+# RCR8rCL has latency of 11. (the value from scheduling profile!)
+# Latency of whole snippet is 12 or 23. (not measured, hand-written.)
+# Thus, latency of BT32rr is 12-11 = 1, or 23-11-11 = 1
+
+---
+mode:            latency
+key:
+  instructions:
+    - 'BT32rr R11D R11D'
+    - 'RCR8rCL R11B R11B'
+  config:          ''
+  register_initial_values:
+    - 'R11D=0x0'
+    - 'R11B=0x0'
+    - 'CL=0x0'
+cpu_name:        bdver2
+llvm_triple:     x86_64-unknown-linux-gnu
+num_repetitions: 10000
+measurements:
+  - { key: latency, value: 50.0000, per_snippet_value: 100.0000 }
+error:           ''
+info:            Repeating two instructions
+assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3
+...
+---
+mode:            latency
+key:
+  instructions:
+    - 'RCR8rCL R11B R11B'
+  config:          ''
+  register_initial_values:
+    - 'R11D=0x0'
+    - 'R11B=0x0'
+    - 'CL=0x0'
+cpu_name:        bdver2
+llvm_triple:     x86_64-unknown-linux-gnu
+num_repetitions: 10000
+measurements:
+  - { key: latency, value: 25.0000, per_snippet_value: 25.0000 }
+error:           ''
+info:            Repeating two instructions
+assembled_snippet: 41BB0000000041B300B100450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DB450FA3DB41D2DBC3
+...
diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.h b/llvm/tools/llvm-exegesis/lib/Analysis.h
--- a/llvm/tools/llvm-exegesis/lib/Analysis.h
+++ b/llvm/tools/llvm-exegesis/lib/Analysis.h
@@ -37,6 +37,7 @@
 class Analysis {
 public:
   Analysis(const Target &Target, std::unique_ptr<MCInstrInfo> InstrInfo,
+           std::unique_ptr<MCSubtargetInfo> SubtargetInfo,
            const InstructionBenchmarkClustering &Clustering,
            double AnalysisInconsistencyEpsilon,
            bool AnalysisDisplayUnstableOpcodes,
diff --git a/llvm/tools/llvm-exegesis/lib/Analysis.cpp b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
--- a/llvm/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Analysis.cpp
@@ -152,11 +152,13 @@
 }
 
 Analysis::Analysis(const Target &Target, std::unique_ptr<MCInstrInfo> InstrInfo,
+                   std::unique_ptr<MCSubtargetInfo> SubtargetInfo,
                    const InstructionBenchmarkClustering &Clustering,
                    double AnalysisInconsistencyEpsilon,
                    bool AnalysisDisplayUnstableOpcodes,
                    const std::string &ForceCpuName)
-    : Clustering_(Clustering), InstrInfo_(std::move(InstrInfo)),
+    : Clustering_(Clustering), SubtargetInfo_(std::move(SubtargetInfo)),
+      InstrInfo_(std::move(InstrInfo)),
       AnalysisInconsistencyEpsilonSquared_(AnalysisInconsistencyEpsilon *
                                            AnalysisInconsistencyEpsilon),
       AnalysisDisplayUnstableOpcodes_(AnalysisDisplayUnstableOpcodes) {
@@ -170,8 +172,6 @@
   MCTargetOptions MCOptions;
   AsmInfo_.reset(
       Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple, MCOptions));
-  SubtargetInfo_.reset(
-      Target.createMCSubtargetInfo(FirstPoint.LLVMTriple, CpuName, ""));
   InstPrinter_.reset(Target.createMCInstPrinter(
       Triple(FirstPoint.LLVMTriple), 0 /*default variant*/, *AsmInfo_,
       *InstrInfo_, *RegInfo_));
@@ -290,6 +290,8 @@
   default:
     llvm_unreachable("invalid mode");
   }
+  if (Point.Info == "WLS fixpoint" || Point.Info == "WLS reconstruction")
+    OS << " <small><i>(" << Point.Info << ")</i></small>";
   OS << "</span> <span class=\"mono\">";
   writeEscaped<kEscapeHtml>(OS, Point.Key.Config);
   OS << "</span></li>";
diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt
--- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt
+++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt
@@ -55,6 +55,7 @@
   MCInstrDescView.cpp
   ParallelSnippetGenerator.cpp
   PerfHelper.cpp
+  PostProcessing.cpp
   RegisterAliasing.cpp
   RegisterValue.cpp
   SchedClassResolution.cpp
diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.h b/llvm/tools/llvm-exegesis/lib/PostProcessing.h
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.h
@@ -0,0 +1,31 @@
+//===-- PostProcessing.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Post-processing for the benchmark points.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H
+#define LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H
+
+#include "BenchmarkResult.h"
+#include <vector>
+
+namespace llvm {
+namespace exegesis {
+/// Fits per-opcode latencies for chained latency snippets; edits \p Points.
+void PostProcessChainedLatencyBenchmarkPoints(
+    std::vector<InstructionBenchmark> &Points,
+    const llvm::MCInstrInfo &InstrInfo,
+    const llvm::MCSubtargetInfo &SubtargetInfo);
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif // LLVM_TOOLS_LLVM_EXEGESIS_POSTPROCESSING_H
diff --git a/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/PostProcessing.cpp
@@ -0,0 +1,178 @@
+//===-- PostProcessing.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PostProcessing.h"
+#include "Clustering.h"
+#include "SchedClassResolution.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/LinearAlgebra.h"
+#include <utility>
+
+namespace llvm {
+namespace exegesis {
+
+/// Returns the per-snippet latency of \p RelevantPoint (currently unmodified)
+/// and whether any instruction's sched class reports forwarding delay cycles.
+/// The flag is always false for single-instruction points.
+static std::pair<double, bool>
+getAdjustedLatency(const InstructionBenchmark *RelevantPoint,
+                   const llvm::MCInstrInfo &InstrInfo,
+                   const llvm::MCSubtargetInfo &SubtargetInfo) {
+  assert(RelevantPoint->Measurements.size() == 1);
+  assert(RelevantPoint->Measurements[0].Key == "latency");
+  const double Measured = RelevantPoint->Measurements[0].PerSnippetValue;
+  const auto &Snippet = RelevantPoint->Key.Instructions;
+  if (Snippet.size() == 1)
+    return {Measured, false};
+
+  const MCSchedModel &Model = SubtargetInfo.getSchedModel();
+  const unsigned ProcID = Model.getProcessorID();
+  bool AnyForwardingDelay = false;
+  for (const MCInst &MI : Snippet) {
+    unsigned ClassID = InstrInfo.get(MI.getOpcode()).getSchedClass();
+
+    // Keep resolving until the class is non-variant (or ID 0 is reached).
+    while (ClassID && Model.getSchedClassDesc(ClassID)->isVariant())
+      ClassID = SubtargetInfo.resolveVariantSchedClass(ClassID, &MI,
+                                                       &InstrInfo, ProcID);
+
+    const MCSchedClassDesc *Desc = Model.getSchedClassDesc(ClassID);
+    if (MCSchedModel::getForwardingDelayCycles(
+            SubtargetInfo.getReadAdvanceEntries(*Desc)) != 0)
+      AnyForwardingDelay = true;
+  }
+
+  return {Measured, AnyForwardingDelay};
+}
+
+/// Turns chained latency measurements into per-opcode latency estimates by
+/// solving a weighted least squares problem over every error-free latency
+/// point that uses a chained opcode. \p Points is replaced by one synthetic
+/// point per such opcode, or left untouched if no snippet was chained.
+void PostProcessChainedLatencyBenchmarkPoints(
+    std::vector<InstructionBenchmark> &Points,
+    const llvm::MCInstrInfo &InstrInfo,
+    const llvm::MCSubtargetInfo &SubtargetInfo) {
+  auto IsLatencyPoint = [](const InstructionBenchmark &Point) {
+    return Point.Mode == InstructionBenchmark::ModeE::Latency &&
+           Point.Error.empty();
+  };
+  auto IsChainedLatencyPoint = [&](const InstructionBenchmark &Point) {
+    return IsLatencyPoint(Point) && Point.Key.Instructions.size() >= 2;
+  };
+
+  unsigned NumOpcodes = InstrInfo.getNumOpcodes();
+  // Which opcodes were ever chained in all of the benchmark points?
+  // Only record the ones for which we succeeded in measuring latency.
+  std::vector<int> OpcodeToIndex(NumOpcodes, /*Index=*/-1);
+  std::vector<MCInst> IndexToOpcode;
+  IndexToOpcode.reserve(NumOpcodes);
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsChainedLatencyPoint)) {
+    for (const MCInst &Instruction : Point.Key.Instructions) {
+      const unsigned Opcode = Instruction.getOpcode();
+      assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)");
+      if (OpcodeToIndex[Opcode] != -1) // Already seen chained?
+        continue;
+      OpcodeToIndex[Opcode] = IndexToOpcode.size();
+      IndexToOpcode.emplace_back(Instruction);
+    }
+  }
+
+  if (IndexToOpcode.empty())
+    return; // Lucky us.
+
+  // Grab these before Points is cleared below: afterwards Points.front()
+  // would refer to the freshly added (still empty) point itself.
+  const std::string CpuName = Points.front().CpuName;
+  const std::string LLVMTriple = Points.front().LLVMTriple;
+
+  // Remember all the points that contained any opcode that was ever chained.
+  // We can not do this in the previous loop, because if Opc0 was chained with
+  // Opc1, we'll miss all standalone points for Opc1 before seeing the chaining.
+  // Pointers are safe here: Points is not modified until they are dropped.
+  SmallVector<const InstructionBenchmark *, 64> RelevantPoints;
+  for (const InstructionBenchmark &Point :
+       make_filter_range(Points, IsLatencyPoint)) {
+    if (any_of(Point.Key.Instructions,
+               [&OpcodeToIndex](const MCInst &Instruction) {
+                 return OpcodeToIndex[Instruction.getOpcode()] != -1;
+               }))
+      RelevantPoints.emplace_back(&Point);
+  }
+
+  using namespace linearalgebra;
+
+  Matrix<double> OpcodeChaining(RelevantPoints.size(), IndexToOpcode.size());
+  Matrix<double> ForwardingDelayPresence(RelevantPoints.size(), 1);
+  Matrix<double> SnippetLatency(RelevantPoints.size(), 1);
+  Matrix<double> Weights = getIdentityMatrix<double>(RelevantPoints.size());
+  std::vector<char> FixpointOpcodes(NumOpcodes, /*IsFixpoint=*/false);
+
+  for (auto I : enumerate(RelevantPoints)) {
+    int row = I.index();
+    const InstructionBenchmark *RelevantPoint = I.value();
+
+    // Observation and 0/1 forwarding-delay flag for this row of the system.
+    std::tie(SnippetLatency(row, 0), ForwardingDelayPresence(row, 0)) =
+        getAdjustedLatency(RelevantPoint, InstrInfo, SubtargetInfo);
+
+    if (RelevantPoint->Key.Instructions.size() == 1) {
+      // Give a(n arbitrary) bonus to the points that directly measured
+      // one single specific instruction. We believe these measurements
+      // to be precise, so the fit should basically not change them.
+      Weights(row, row) = 1e+3;
+      FixpointOpcodes[RelevantPoint->Key.Instructions.front().getOpcode()] =
+          true;
+    }
+
+    for (const MCInst &Instrn : RelevantPoint->Key.Instructions) {
+      double &Entry = OpcodeChaining(row, OpcodeToIndex[Instrn.getOpcode()]);
+      assert(Entry == 0);
+      Entry = 1;
+    }
+  }
+  RelevantPoints.clear();
+  OpcodeToIndex.clear();
+
+  auto Zz = getAugmentedMatrix(OpcodeChaining, ForwardingDelayPresence);
+  auto EstimatedInstructionLatencies =
+      getWeightedLeastSquaresEstimator<double>(Zz, SnippetLatency, Weights);
+  assert(EstimatedInstructionLatencies.getNumColumns() == 1);
+  assert((size_t)EstimatedInstructionLatencies.getNumRows() >=
+         IndexToOpcode.size());
+
+  Points.clear();
+  Points.reserve(IndexToOpcode.size());
+  for (auto I : enumerate(IndexToOpcode)) {
+    const MCInst &Instruction = I.value();
+    const double EstimatedInstructionLatency =
+        EstimatedInstructionLatencies(I.index(), 0);
+
+    Points.emplace_back();
+    InstructionBenchmark &NewPoint = Points.back();
+
+    NewPoint.Key.Instructions.emplace_back(Instruction);
+    NewPoint.Mode = InstructionBenchmark::Latency;
+    NewPoint.CpuName = CpuName;
+    NewPoint.LLVMTriple = LLVMTriple;
+    NewPoint.Measurements.emplace_back(
+        BenchmarkMeasure::Create("latency", EstimatedInstructionLatency));
+    NewPoint.Info = FixpointOpcodes[Instruction.getOpcode()]
+                        ? "WLS fixpoint"
+                        : "WLS reconstruction";
+  }
+  for (int I = IndexToOpcode.size();
+       I < EstimatedInstructionLatencies.getNumRows(); ++I)
+    errs() << "Reconstructed forwarding delay = "
+           << EstimatedInstructionLatencies(I, 0) << "\n";
+}
+
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -18,6 +18,7 @@
 #include "lib/Error.h"
 #include "lib/LlvmState.h"
 #include "lib/PerfHelper.h"
+#include "lib/PostProcessing.h"
 #include "lib/SnippetFile.h"
 #include "lib/SnippetRepetitor.h"
 #include "lib/Target.h"
@@ -409,7 +410,7 @@
 
   // Read benchmarks.
   const LLVMState State("");
-  const std::vector<InstructionBenchmark> Points = ExitOnFileError(
+  std::vector<InstructionBenchmark> Points = ExitOnFileError(
       BenchmarkFile, InstructionBenchmark::readYamls(State, BenchmarkFile));
 
   outs() << "Parsed " << Points.size() << " benchmark points\n";
@@ -420,6 +421,7 @@
   // FIXME: Check that all points have the same triple/cpu.
   // FIXME: Merge points from several runs (latency and uops).
 
+
   std::string Error;
   const auto *TheTarget =
       TargetRegistry::lookupTarget(Points[0].LLVMTriple, Error);
@@ -431,13 +433,19 @@
   std::unique_ptr<MCInstrInfo> InstrInfo(TheTarget->createMCInstrInfo());
   assert(InstrInfo && "Unable to create instruction info!");
 
+  std::unique_ptr<MCSubtargetInfo> SubtargetInfo(
+      TheTarget->createMCSubtargetInfo(Points[0].LLVMTriple, CpuName, ""));
+  assert(SubtargetInfo && "Unable to create subtarget info!");
+
+  PostProcessChainedLatencyBenchmarkPoints(Points, *InstrInfo, *SubtargetInfo);
+
   const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create(
       Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints,
       AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes()));
 
-  const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering,
-                          AnalysisInconsistencyEpsilon,
-                          AnalysisDisplayUnstableOpcodes, CpuName);
+  const Analysis Analyzer(
+      *TheTarget, std::move(InstrInfo), std::move(SubtargetInfo), Clustering,
+      AnalysisInconsistencyEpsilon, AnalysisDisplayUnstableOpcodes, CpuName);
 
   maybeRunAnalysis<Analysis::PrintClusters>(Analyzer, "analysis clusters",
                                             AnalysisClustersOutputFile);
diff --git a/llvm/unittests/Support/LinearAlgebraTest.cpp b/llvm/unittests/Support/LinearAlgebraTest.cpp
--- a/llvm/unittests/Support/LinearAlgebraTest.cpp
+++ b/llvm/unittests/Support/LinearAlgebraTest.cpp
@@ -175,7 +175,7 @@
   y(0, 0) = 5;
   y(1, 0) = 6;
 
-  auto beta = getOrdinaryLeastSquaresEstimation<double>(M, y);
+  auto beta = getOrdinaryLeastSquaresEstimator<double>(M, y);
   EXPECT_EQ(std::vector<double>({4, 1. / 2}), getAllValues(beta));
 }
 
@@ -199,7 +199,7 @@
   y(2, 0) = 7;
   y(3, 0) = 10;
 
-  auto beta = getOrdinaryLeastSquaresEstimation<double>(M, y);
+  auto beta = getOrdinaryLeastSquaresEstimator<double>(M, y);
   EXPECT_EQ(std::vector<double>({7. / 2, 7. / 5}), getAllValues(beta));
 }