Index: docs/CommandGuide/llvm-exegesis.rst
===================================================================
--- docs/CommandGuide/llvm-exegesis.rst
+++ docs/CommandGuide/llvm-exegesis.rst
@@ -67,7 +67,7 @@
 .. code-block:: bash
 
   #!/bin/bash
-  readonly INSTRUCTIONS=$(grep INSTRUCTION_LIST_END build/lib/Target/X86/X86GenInstrInfo.inc | cut -f2 -d=)
+  readonly INSTRUCTIONS=$(($(grep INSTRUCTION_LIST_END build/lib/Target/X86/X86GenInstrInfo.inc | cut -f2 -d=) - 1))
   for INSTRUCTION in $(seq 1 ${INSTRUCTIONS});
   do
     ./build/bin/llvm-exegesis -mode=latency -opcode-index=${INSTRUCTION} | sed -n '/---/,$p'
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2283,8 +2283,11 @@
     return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
 
   // fold (addcarry x, y, false) -> (uaddo x, y)
-  if (isNullConstant(CarryIn))
-    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+  if (isNullConstant(CarryIn)) {
+    if (!LegalOperations ||
+        TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
+      return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+  }
 
   // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
   if (isNullConstant(N0) && isNullConstant(N1)) {
@@ -2592,8 +2595,11 @@
   SDValue CarryIn = N->getOperand(2);
 
   // fold (subcarry x, y, false) -> (usubo x, y)
-  if (isNullConstant(CarryIn))
-    return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+  if (isNullConstant(CarryIn)) {
+    if (!LegalOperations ||
+        TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
+      return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+  }
 
   return SDValue();
 }
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3499,15 +3499,25 @@
   case ISD::USUBO: {
     SDValue LHS = Node->getOperand(0);
     SDValue RHS = Node->getOperand(1);
-    SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
-                              ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
-                              LHS, RHS);
+    bool IsAdd = Node->getOpcode() == ISD::UADDO;
+    // If ADD/SUBCARRY is legal, use that instead.
+    unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY;
+    if (TLI.isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) {
+      SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1));
+      SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(),
+                                      { LHS, RHS, CarryIn });
+      Results.push_back(SDValue(NodeCarry.getNode(), 0));
+      Results.push_back(SDValue(NodeCarry.getNode(), 1));
+      break;
+    }
+
+    SDValue Sum = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl,
+                              LHS.getValueType(), LHS, RHS);
     Results.push_back(Sum);
 
     EVT ResultType = Node->getValueType(1);
     EVT SetCCType = getSetCCResultType(Node->getValueType(0));
-    ISD::CondCode CC
-      = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
+    ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
     SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
 
     Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
Index: lib/Target/Hexagon/HexagonISelDAGToDAG.h
===================================================================
--- lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -105,6 +105,7 @@
   void SelectV65Gather(SDNode *N);
   void SelectV65GatherPred(SDNode *N);
   void SelectHVXDualOutput(SDNode *N);
+  void SelectAddSubCarry(SDNode *N);
   void SelectVAlign(SDNode *N);
   void SelectVAlignAddr(SDNode *N);
   void SelectTypecast(SDNode *N);
Index: lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
===================================================================
--- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -762,6 +762,15 @@
   ReplaceNode(N, R);
 }
 
+void HexagonDAGToDAGISel::SelectAddSubCarry(SDNode *N) {
+  unsigned OpcCarry = N->getOpcode() == HexagonISD::ADDC ? Hexagon::A4_addp_c
+                                                         : Hexagon::A4_subp_c;
+  SDNode *C = CurDAG->getMachineNode(OpcCarry, SDLoc(N), N->getVTList(),
+                                     { N->getOperand(0), N->getOperand(1),
+                                       N->getOperand(2) });
+  ReplaceNode(N, C);
+}
+
 void HexagonDAGToDAGISel::SelectVAlign(SDNode *N) {
   MVT ResTy = N->getValueType(0).getSimpleVT();
   if (HST->isHVXVectorType(ResTy, true))
@@ -875,6 +884,9 @@
   case ISD::STORE:                return SelectStore(N);
   case ISD::INTRINSIC_W_CHAIN:    return SelectIntrinsicWChain(N);
   case ISD::INTRINSIC_WO_CHAIN:   return SelectIntrinsicWOChain(N);
+
+  case HexagonISD::ADDC:
+  case HexagonISD::SUBC:          return SelectAddSubCarry(N);
   case HexagonISD::VALIGN:        return SelectVAlign(N);
   case HexagonISD::VALIGNADDR:    return SelectVAlignAddr(N);
   case HexagonISD::TYPECAST:      return SelectTypecast(N);
Index: lib/Target/Hexagon/HexagonISelLowering.h
===================================================================
--- lib/Target/Hexagon/HexagonISelLowering.h
+++ lib/Target/Hexagon/HexagonISelLowering.h
@@ -36,6 +36,8 @@
 
       CONST32 = OP_BEGIN,
       CONST32_GP,  // For marking data present in GP.
+      ADDC,        // Add with carry: (X, Y, Cin) -> (X+Y, Cout).
+      SUBC,        // Sub with carry: (X, Y, Cin) -> (X+~Y+Cin, Cout).
       ALLOCA,
 
       AT_GOT,      // Index in GOT.
@@ -162,6 +164,7 @@
     SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/Hexagon/HexagonISelLowering.cpp
===================================================================
--- lib/Target/Hexagon/HexagonISelLowering.cpp
+++ lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1327,13 +1327,18 @@
     setMinimumJumpTableEntries(std::numeric_limits<int>::max());
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 
-  // Only add and sub that detect overflow are the saturating ones.
+  // Hexagon has A4_addp_c and A4_subp_c that take and generate a carry bit,
+  // but they only operate on i64.
   for (MVT VT : MVT::integer_valuetypes()) {
-    setOperationAction(ISD::UADDO, VT, Expand);
-    setOperationAction(ISD::SADDO, VT, Expand);
-    setOperationAction(ISD::USUBO, VT, Expand);
-    setOperationAction(ISD::SSUBO, VT, Expand);
+    setOperationAction(ISD::UADDO,    VT, Expand);
+    setOperationAction(ISD::USUBO,    VT, Expand);
+    setOperationAction(ISD::SADDO,    VT, Expand);
+    setOperationAction(ISD::SSUBO,    VT, Expand);
+    setOperationAction(ISD::ADDCARRY, VT, Expand);
+    setOperationAction(ISD::SUBCARRY, VT, Expand);
   }
+  setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
+  setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
 
   setOperationAction(ISD::CTLZ, MVT::i8,  Promote);
   setOperationAction(ISD::CTLZ, MVT::i16, Promote);
@@ -1681,6 +1686,8 @@
 
 const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((HexagonISD::NodeType)Opcode) {
+  case HexagonISD::ADDC:          return "HexagonISD::ADDC";
+  case HexagonISD::SUBC:          return "HexagonISD::SUBC";
   case HexagonISD::ALLOCA:        return "HexagonISD::ALLOCA";
   case HexagonISD::AT_GOT:        return "HexagonISD::AT_GOT";
   case HexagonISD::AT_PCREL:      return "HexagonISD::AT_PCREL";
@@ -2705,6 +2712,24 @@
   return M;
 }
 
+SDValue
+HexagonTargetLowering::LowerAddSubCarry(SDValue Op, SelectionDAG &DAG) const {
+  const SDLoc &dl(Op);
+  unsigned Opc = Op.getOpcode();
+  SDValue X = Op.getOperand(0), Y = Op.getOperand(1), C = Op.getOperand(2);
+
+  if (Opc == ISD::ADDCARRY)
+    return DAG.getNode(HexagonISD::ADDC, dl, Op.getNode()->getVTList(),
+                       { X, Y, C });
+
+  EVT CarryTy = C.getValueType();
+  SDValue SubC = DAG.getNode(HexagonISD::SUBC, dl, Op.getNode()->getVTList(),
+                             { X, Y, DAG.getLogicalNOT(dl, C, CarryTy) });
+  SDValue Out[] = { SubC.getValue(0),
+                    DAG.getLogicalNOT(dl, SubC.getValue(1), CarryTy) };
+  return DAG.getMergeValues(Out, dl);
+}
+
 SDValue
 HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain     = Op.getOperand(0);
@@ -2763,6 +2788,8 @@
     case ISD::VECTOR_SHUFFLE:       return LowerVECTOR_SHUFFLE(Op, DAG);
     case ISD::BITCAST:              return LowerBITCAST(Op, DAG);
     case ISD::LOAD:                 return LowerUnalignedLoad(Op, DAG);
+    case ISD::ADDCARRY:
+    case ISD::SUBCARRY:             return LowerAddSubCarry(Op, DAG);
     case ISD::SRA:
     case ISD::SHL:
     case ISD::SRL:                  return LowerVECTOR_SHIFT(Op, DAG);
Index: lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -2253,7 +2253,7 @@
     // set of bit groups, and then mask in the zeros at the end. With early
     // masking, we only insert the non-zero parts of the result at every step.
 
-    unsigned InstCnt, InstCntLateMask;
+    unsigned InstCnt = 0, InstCntLateMask = 0;
     LLVM_DEBUG(dbgs() << "\tEarly masking:\n");
     SDNode *RN = Select(N, false, &InstCnt);
     LLVM_DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
Index: test/CodeGen/Hexagon/adde.ll
===================================================================
--- test/CodeGen/Hexagon/adde.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
-
-; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
-; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
-
-define void @check_adde_addc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) {
-b6:
-  %v7 = zext i64 %a0 to i128
-  %v8 = zext i64 %a1 to i128
-  %v9 = shl i128 %v8, 64
-  %v10 = or i128 %v7, %v9
-  %v11 = zext i64 %a2 to i128
-  %v12 = zext i64 %a3 to i128
-  %v13 = shl i128 %v12, 64
-  %v14 = or i128 %v11, %v13
-  %v15 = add i128 %v10, %v14
-  %v16 = lshr i128 %v15, 64
-  %v17 = trunc i128 %v15 to i64
-  %v18 = trunc i128 %v16 to i64
-  store i64 %v17, i64* %a4
-  store i64 %v18, i64* %a5
-  ret void
-}
Index: test/CodeGen/Hexagon/addsubcarry.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Hexagon/addsubcarry.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+@g = global i128 zeroinitializer, align 8
+
+; CHECK-LABEL: addc:
+; CHECK: p[[P0:[0-3]]] = and(p[[P1:[0-9]]],!p[[P1]])
+; CHECK: add({{.*}},{{.*}},p[[P0]]):carry
+; CHECK: add({{.*}},{{.*}},p[[P0]]):carry
+define void @addc(i128 %a0, i128 %a1) #0 {
+  %v0 = add i128 %a0, %a1
+  store i128 %v0, i128* @g, align 8
+  ret void
+}
+
+; CHECK-LABEL: subc:
+; CHECK: p[[P0:[0-3]]] = or(p[[P1:[0-9]]],!p[[P1]])
+; CHECK: sub({{.*}},{{.*}},p[[P0]]):carry
+; CHECK: sub({{.*}},{{.*}},p[[P0]]):carry
+define void @subc(i128 %a0, i128 %a1) #0 {
+  %v0 = sub i128 %a0, %a1
+  store i128 %v0, i128* @g, align 8
+  ret void
+}
+
+
Index: test/CodeGen/Hexagon/sube.ll
===================================================================
--- test/CodeGen/Hexagon/sube.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
-
-; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
-; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
-; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
-
-define void @check_sube_subc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) {
-b6:
-  %v7 = zext i64 %a0 to i128
-  %v8 = zext i64 %a1 to i128
-  %v9 = shl i128 %v8, 64
-  %v10 = or i128 %v7, %v9
-  %v11 = zext i64 %a2 to i128
-  %v12 = zext i64 %a3 to i128
-  %v13 = shl i128 %v12, 64
-  %v14 = or i128 %v11, %v13
-  %v15 = sub i128 %v10, %v14
-  %v16 = lshr i128 %v15, 64
-  %v17 = trunc i128 %v15 to i64
-  %v18 = trunc i128 %v16 to i64
-  store i64 %v17, i64* %a4
-  store i64 %v18, i64* %a5
-  ret void
-}
Index: test/Transforms/InstCombine/sdiv-guard.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/sdiv-guard.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+; FIXME: If %flag is false then %s == 0 and guard should be triggered.
+
+define i32 @a(i1 %flag, i32 %X) nounwind readnone {
+; CHECK-LABEL: @a(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[CMP]]) #1 [ "deopt"() ]
+; CHECK-NEXT:    [[R:%.*]] = sdiv i32 100, [[X]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %s = select i1 %flag, i32 %X, i32 0
+  %cmp = icmp ne i32 %s, 0
+  call void(i1, ...) @llvm.experimental.guard( i1 %cmp )[ "deopt"() ]
+  %r = sdiv i32 100, %s
+  ret i32 %r
+}
+
Index: tools/llvm-exegesis/lib/Analysis.h
===================================================================
--- tools/llvm-exegesis/lib/Analysis.h
+++ tools/llvm-exegesis/lib/Analysis.h
@@ -21,6 +21,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include <set>
 #include <string>
 #include <unordered_map>
 
@@ -40,11 +41,62 @@
   template <typename Pass> llvm::Error run(llvm::raw_ostream &OS) const;
 
 private:
+  using ClusterId = InstructionBenchmarkClustering::ClusterId;
+  // An llvm::MCSchedClassDesc augmented with some additional data.
+  struct SchedClass {
+    SchedClass(const llvm::MCSchedClassDesc &SD,
+               const llvm::MCSubtargetInfo &STI);
+
+    const llvm::MCSchedClassDesc &SCDesc;
+    const llvm::SmallVector<llvm::MCWriteProcResEntry, 8>
+        NonRedundantWriteProcRes;
+    const std::vector<std::pair<uint16_t, float>> IdealizedProcResPressure;
+  };
+
+  // Represents the intersection of a sched class and a cluster.
+  class SchedClassCluster {
+  public:
+    const InstructionBenchmarkClustering::ClusterId &id() const {
+      return ClusterId;
+    }
+    const std::vector<size_t> &getPointIds() const { return PointIds; }
+    const std::vector<BenchmarkMeasureStats> &getRepresentative() const {
+      return Representative;
+    }
+
+    // Returns true if the cluster representative measurements match that of SC.
+    bool
+    measurementsMatch(const llvm::MCSubtargetInfo &STI, const SchedClass &SC,
+                      const InstructionBenchmarkClustering &Clustering) const;
+
+    void addPoint(size_t PointId,
+                  const InstructionBenchmarkClustering &Clustering) {
+      PointIds.push_back(PointId);
+      const auto &Point = Clustering.getPoints()[PointId];
+      if (ClusterId.isUndef()) {
+        ClusterId = Clustering.getClusterIdForPoint(PointId);
+        Representative.resize(Point.Measurements.size());
+      }
+      for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) {
+        Representative[I].push(Point.Measurements[I]);
+      }
+      assert(ClusterId == Clustering.getClusterIdForPoint(PointId));
+    }
+
+  private:
+    InstructionBenchmarkClustering::ClusterId ClusterId;
+    std::vector<size_t> PointIds;
+    // Measurement stats for the points in the SchedClassCluster.
+    std::vector<BenchmarkMeasureStats> Representative;
+  };
+
   void printInstructionRowCsv(size_t PointId, llvm::raw_ostream &OS) const;
 
-  void printSchedClassClustersHtml(std::vector<size_t> PointIds,
-                                   llvm::raw_ostream &OS) const;
-  void printSchedClassDescHtml(const llvm::MCSchedClassDesc &SCDesc,
+  void
+  printSchedClassClustersHtml(const std::vector<SchedClassCluster> &Clusters,
+                              const SchedClass &SC,
+                              llvm::raw_ostream &OS) const;
+  void printSchedClassDescHtml(const SchedClass &SC,
                                llvm::raw_ostream &OS) const;
 
   // Builds a map of Sched Class -> indices of points that belong to the sched
@@ -52,12 +104,22 @@
   std::unordered_map<unsigned, std::vector<size_t>>
   makePointsPerSchedClass() const;
 
+  // Returns true of the data for this cluster matches the checked-in llvm data
+  // for this class.
+
   const InstructionBenchmarkClustering &Clustering_;
   std::unique_ptr<llvm::MCSubtargetInfo> SubtargetInfo_;
   std::unique_ptr<llvm::MCInstrInfo> InstrInfo_;
   std::unordered_map<std::string, unsigned> MnemonicToOpcode_;
 };
 
+// Computes the idealized ProcRes Unit pressure. This is the expected
+// distribution if the CPU scheduler can distribute the load as evenly as
+// possible.
+std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
+    const llvm::MCSchedModel &SM,
+    llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS);
+
 } // namespace exegesis
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
Index: tools/llvm-exegesis/lib/Analysis.cpp
===================================================================
--- tools/llvm-exegesis/lib/Analysis.cpp
+++ tools/llvm-exegesis/lib/Analysis.cpp
@@ -9,6 +9,7 @@
 
 #include "Analysis.h"
 #include "BenchmarkResult.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <unordered_set>
 #include <vector>
@@ -167,20 +168,15 @@
   return PointsPerSchedClass;
 }
 
-void Analysis::printSchedClassClustersHtml(std::vector<size_t> PointIds,
-                                           llvm::raw_ostream &OS) const {
-  assert(!PointIds.empty());
-  // Sort the points by cluster id so that we can display them grouped by
-  // cluster.
-  std::sort(PointIds.begin(), PointIds.end(),
-            [this](const size_t A, const size_t B) {
-              return Clustering_.getClusterIdForPoint(A) <
-                     Clustering_.getClusterIdForPoint(B);
-            });
+void Analysis::printSchedClassClustersHtml(
+    const std::vector<SchedClassCluster> &Clusters, const SchedClass &SC,
+    llvm::raw_ostream &OS) const {
   const auto &Points = Clustering_.getPoints();
   OS << "<table class=\"sched-class-clusters\">";
   OS << "<tr><th>ClusterId</th><th>Opcode/Config</th>";
-  for (const auto &Measurement : Points[PointIds[0]].Measurements) {
+  assert(!Clusters.empty());
+  for (const auto &Measurement :
+       Points[Clusters[0].getPointIds()[0]].Measurements) {
     OS << "<th>";
     if (Measurement.DebugString.empty())
       writeEscaped<kEscapeHtml>(OS, Measurement.Key);
@@ -189,29 +185,24 @@
     OS << "</th>";
   }
   OS << "</tr>";
-  for (size_t I = 0, E = PointIds.size(); I < E;) {
-    const auto &CurrentClusterId =
-        Clustering_.getClusterIdForPoint(PointIds[I]);
-    OS << "<tr><td>";
-    writeClusterId<kEscapeHtml>(OS, CurrentClusterId);
+  for (const SchedClassCluster &Cluster : Clusters) {
+    OS << "<tr class=\""
+       << (Cluster.measurementsMatch(*SubtargetInfo_, SC, Clustering_)
+               ? "good-cluster"
+               : "bad-cluster")
+       << "\"><td>";
+    writeClusterId<kEscapeHtml>(OS, Cluster.id());
     OS << "</td><td><ul>";
-    std::vector<BenchmarkMeasureStats> MeasurementStats(
-        Points[PointIds[I]].Measurements.size());
-    for (; I < E &&
-           Clustering_.getClusterIdForPoint(PointIds[I]) == CurrentClusterId;
-         ++I) {
-      const auto &Point = Points[PointIds[I]];
+    for (const size_t PointId : Cluster.getPointIds()) {
+      const auto &Point = Points[PointId];
       OS << "<li><span class=\"mono\">";
       writeEscaped<kEscapeHtml>(OS, Point.Key.OpcodeName);
       OS << "</span> <span class=\"mono\">";
       writeEscaped<kEscapeHtml>(OS, Point.Key.Config);
       OS << "</span></li>";
-      for (size_t J = 0, F = Point.Measurements.size(); J < F; ++J) {
-        MeasurementStats[J].push(Point.Measurements[J]);
-      }
     }
     OS << "</ul></td>";
-    for (const auto &Stats : MeasurementStats) {
+    for (const auto &Stats : Cluster.getRepresentative()) {
       OS << "<td class=\"measurement\">";
       writeMeasurementValue<kEscapeHtml>(OS, Stats.avg());
       OS << "<br><span class=\"minmax\">[";
@@ -299,22 +290,87 @@
   return Result;
 }
 
-void Analysis::printSchedClassDescHtml(const llvm::MCSchedClassDesc &SCDesc,
+Analysis::SchedClass::SchedClass(const llvm::MCSchedClassDesc &SD,
+                                 const llvm::MCSubtargetInfo &STI)
+    : SCDesc(SD),
+      NonRedundantWriteProcRes(getNonRedundantWriteProcRes(SD, STI)),
+      IdealizedProcResPressure(computeIdealizedProcResPressure(
+          STI.getSchedModel(), NonRedundantWriteProcRes)) {}
+
+bool Analysis::SchedClassCluster::measurementsMatch(
+    const llvm::MCSubtargetInfo &STI, const SchedClass &SC,
+    const InstructionBenchmarkClustering &Clustering) const {
+  const size_t NumMeasurements = Representative.size();
+  std::vector<BenchmarkMeasure> ClusterCenterPoint(NumMeasurements);
+  std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
+  // Latency case.
+  assert(!Clustering.getPoints().empty());
+  const std::string &Mode = Clustering.getPoints()[0].Key.Mode;
+  if (Mode == "latency") { // FIXME: use an enum.
+    if (NumMeasurements != 1) {
+      llvm::errs()
+          << "invalid number of measurements in latency mode: expected 1, got "
+          << NumMeasurements << "\n";
+      return false;
+    }
+    // Find the latency.
+    SchedClassPoint[0].Value = 0.0;
+    for (unsigned I = 0; I < SC.SCDesc.NumWriteLatencyEntries; ++I) {
+      const llvm::MCWriteLatencyEntry *const WLE =
+          STI.getWriteLatencyEntry(&SC.SCDesc, I);
+      SchedClassPoint[0].Value =
+          std::max<double>(SchedClassPoint[0].Value, WLE->Cycles);
+    }
+    ClusterCenterPoint[0].Value = Representative[0].avg();
+  } else if (Mode == "uops") {
+    for (int I = 0, E = Representative.size(); I < E; ++I) {
+      // Find the pressure on ProcResIdx `Key`.
+      uint16_t ProcResIdx = 0;
+      if (!llvm::to_integer(Representative[I].key(), ProcResIdx, 10)) {
+        llvm::errs() << "expected ProcResIdx key, got "
+                     << Representative[I].key() << "\n";
+        return false;
+      }
+      const auto ProcResPressureIt =
+          std::find_if(SC.IdealizedProcResPressure.begin(),
+                       SC.IdealizedProcResPressure.end(),
+                       [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
+                         return WPR.first == ProcResIdx;
+                       });
+      SchedClassPoint[I].Value =
+          ProcResPressureIt == SC.IdealizedProcResPressure.end()
+              ? 0.0
+              : ProcResPressureIt->second;
+      ClusterCenterPoint[I].Value = Representative[I].avg();
+    }
+  } else {
+    llvm::errs() << "unimplemented measurement matching for mode ''" << Mode
+                 << "''\n";
+    return false;
+  }
+  return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint);
+}
+
+void Analysis::printSchedClassDescHtml(const SchedClass &SC,
                                        llvm::raw_ostream &OS) const {
   OS << "<table class=\"sched-class-desc\">";
   OS << "<tr><th>Valid</th><th>Variant</th><th>uOps</th><th>Latency</"
-        "th><th>WriteProcRes</th></tr>";
-  if (SCDesc.isValid()) {
+        "th><th>WriteProcRes</th><th title=\"This is the idealized unit "
+        "resource (port) pressure assuming ideal distribution\">Idealized "
+        "Resource Pressure</th></tr>";
+  if (SC.SCDesc.isValid()) {
+    const auto &SM = SubtargetInfo_->getSchedModel();
     OS << "<tr><td>&#10004;</td>";
-    OS << "<td>" << (SCDesc.isVariant() ? "&#10004;" : "&#10005;") << "</td>";
-    OS << "<td>" << SCDesc.NumMicroOps << "</td>";
+    OS << "<td>" << (SC.SCDesc.isVariant() ? "&#10004;" : "&#10005;")
+       << "</td>";
+    OS << "<td>" << SC.SCDesc.NumMicroOps << "</td>";
     // Latencies.
     OS << "<td><ul>";
-    for (int I = 0, E = SCDesc.NumWriteLatencyEntries; I < E; ++I) {
+    for (int I = 0, E = SC.SCDesc.NumWriteLatencyEntries; I < E; ++I) {
       const auto *const Entry =
-          SubtargetInfo_->getWriteLatencyEntry(&SCDesc, I);
+          SubtargetInfo_->getWriteLatencyEntry(&SC.SCDesc, I);
       OS << "<li>" << Entry->Cycles;
-      if (SCDesc.NumWriteLatencyEntries > 1) {
+      if (SC.SCDesc.NumWriteLatencyEntries > 1) {
         // Dismabiguate if more than 1 latency.
         OS << " (WriteResourceID " << Entry->WriteResourceID << ")";
       }
@@ -323,13 +379,23 @@
     OS << "</ul></td>";
     // WriteProcRes.
     OS << "<td><ul>";
-    for (const auto &WPR :
-         getNonRedundantWriteProcRes(SCDesc, *SubtargetInfo_)) {
+    for (const auto &WPR : SC.NonRedundantWriteProcRes) {
+      OS << "<li><span class=\"mono\">";
+      writeEscaped<kEscapeHtml>(OS,
+                                SM.getProcResource(WPR.ProcResourceIdx)->Name);
+      OS << "</span>: " << WPR.Cycles << "</li>";
+    }
+    OS << "</ul></td>";
+    // Idealized port pressure.
+    OS << "<td><ul>";
+    for (const auto &Pressure : SC.IdealizedProcResPressure) {
       OS << "<li><span class=\"mono\">";
       writeEscaped<kEscapeHtml>(OS, SubtargetInfo_->getSchedModel()
-                                        .getProcResource(WPR.ProcResourceIdx)
+                                        .getProcResource(Pressure.first)
                                         ->Name);
-      OS << "</span>: " << WPR.Cycles << "</li>";
+      OS << "</span>: ";
+      writeMeasurementValue<kEscapeHtml>(OS, Pressure.second);
+      OS << "</li>";
     }
     OS << "</ul></td>";
     OS << "</tr>";
@@ -386,12 +452,21 @@
 span.mono {
   font-family: monospace;
 }
-span.minmax {
-  color: #888;
-}
 td.measurement {
   text-align: center;
 }
+tr.good-cluster td.measurement {
+  color: #292
+}
+tr.bad-cluster td.measurement {
+  color: #922
+}
+tr.good-cluster td.measurement span.minmax {
+  color: #888;
+}
+tr.bad-cluster td.measurement span.minmax {
+  color: #888;
+}
 </style>
 </head>
 )";
@@ -399,46 +474,65 @@
 template <>
 llvm::Error Analysis::run<Analysis::PrintSchedClassInconsistencies>(
     llvm::raw_ostream &OS) const {
+  const auto &FirstPoint = Clustering_.getPoints()[0];
   // Print the header.
   OS << "<!DOCTYPE html><html>" << kHtmlHead << "<body>";
   OS << "<h1><span class=\"mono\">llvm-exegesis</span> Analysis Results</h1>";
   OS << "<h3>Triple: <span class=\"mono\">";
-  writeEscaped<kEscapeHtml>(OS, Clustering_.getPoints()[0].LLVMTriple);
+  writeEscaped<kEscapeHtml>(OS, FirstPoint.LLVMTriple);
   OS << "</span></h3><h3>Cpu: <span class=\"mono\">";
-  writeEscaped<kEscapeHtml>(OS, Clustering_.getPoints()[0].CpuName);
+  writeEscaped<kEscapeHtml>(OS, FirstPoint.CpuName);
   OS << "</span></h3>";
 
-  // All the points in a scheduling class should be in the same cluster.
-  // Print any scheduling class for which this is not the case.
   for (const auto &SchedClassAndPoints : makePointsPerSchedClass()) {
-    std::unordered_set<size_t> ClustersForSchedClass;
-    for (const size_t PointId : SchedClassAndPoints.second) {
+    const auto SchedClassId = SchedClassAndPoints.first;
+    const std::vector<size_t> &SchedClassPoints = SchedClassAndPoints.second;
+    const auto &SchedModel = SubtargetInfo_->getSchedModel();
+    const llvm::MCSchedClassDesc *const SCDesc =
+        SchedModel.getSchedClassDesc(SchedClassId);
+    if (!SCDesc)
+      continue;
+    const SchedClass SC(*SCDesc, *SubtargetInfo_);
+
+    // Bucket sched class points into sched class clusters.
+    std::vector<SchedClassCluster> SchedClassClusters;
+    for (const size_t PointId : SchedClassPoints) {
       const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId);
       if (!ClusterId.isValid())
-        continue; // Ignore noise and errors.
-      ClustersForSchedClass.insert(ClusterId.getId());
+        continue; // Ignore noise and errors. FIXME: take noise into account ?
+      auto SchedClassClusterIt =
+          std::find_if(SchedClassClusters.begin(), SchedClassClusters.end(),
+                       [ClusterId](const SchedClassCluster &C) {
+                         return C.id() == ClusterId;
+                       });
+      if (SchedClassClusterIt == SchedClassClusters.end()) {
+        SchedClassClusters.emplace_back();
+        SchedClassClusterIt = std::prev(SchedClassClusters.end());
+      }
+      SchedClassClusterIt->addPoint(PointId, Clustering_);
     }
-    if (ClustersForSchedClass.size() <= 1)
+
+    // Print any scheduling class that has at least one cluster that does not
+    // match the checked-in data.
+    if (std::all_of(SchedClassClusters.begin(), SchedClassClusters.end(),
+                    [this, &SC](const SchedClassCluster &C) {
+                      return C.measurementsMatch(*SubtargetInfo_, SC,
+                                                 Clustering_);
+                    }))
       continue; // Nothing weird.
 
-    const auto &SchedModel = SubtargetInfo_->getSchedModel();
-    const llvm::MCSchedClassDesc *const SCDesc =
-        SchedModel.getSchedClassDesc(SchedClassAndPoints.first);
-    if (!SCDesc)
-      continue;
     OS << "<div class=\"inconsistency\"><p>Sched Class <span "
           "class=\"sched-class-name\">";
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     writeEscaped<kEscapeHtml>(OS, SCDesc->Name);
 #else
-    OS << SchedClassAndPoints.first;
+    OS << SchedClassId;
 #endif
-    OS << "</span> contains instructions with distinct performance "
-          "characteristics, falling into "
-       << ClustersForSchedClass.size() << " clusters:</p>";
-    printSchedClassClustersHtml(SchedClassAndPoints.second, OS);
-    OS << "<p>llvm data:</p>";
-    printSchedClassDescHtml(*SCDesc, OS);
+    OS << "</span> contains instructions whose performance characteristics do"
+          " not match that of LLVM:</p>";
+    printSchedClassClustersHtml(SchedClassClusters, SC, OS);
+    OS << "<p>llvm SchedModel data:</p>";
+    printSchedClassDescHtml(SC, OS);
     OS << "</div>";
   }
 
@@ -446,4 +540,118 @@
   return llvm::Error::success();
 }
 
+// Distributes a pressure budget as evenly as possible on the provided subunits
+// given the already existing port pressure distribution.
+//
+// The algorithm is as follows: while there is remaining pressure to
+// distribute, find the subunits with minimal pressure, and distribute
+// remaining pressure equally up to the pressure of the unit with
+// second-to-minimal pressure.
+// For example, let's assume we want to distribute 2*P1256
+// (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
+//     DensePressure =        P0   P1   P2   P3   P4   P5   P6   P7
+//                           0.1  0.3  0.2  0.0  0.0  0.5  0.5  0.5
+//     RemainingPressure = 2.0
+// We sort the subunits by pressure:
+//     Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
+// We'll first start by the subunits with minimal pressure, which are at
+// the beginning of the sorted array. In this example there is one (P2).
+// The subunit with second-to-minimal pressure is the next one in the
+// array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
+// from the budget.
+//     Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)]
+//     RemainingPressure = 1.9
+// We repeat this process: distribute 0.2 pressure on each of the minimal
+// P2 and P1, decrease budget by 2*0.2:
+//     Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)]
+//     RemainingPressure = 1.5
+// There are no second-to-minimal subunits so we just share the remaining
+// budget (1.5 cycles) equally:
+//     Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)]
+//     RemainingPressure = 0.0
+// We stop as there is no remaining budget to distribute.
+void distributePressure(float RemainingPressure,
+                        llvm::SmallVector<uint16_t, 32> Subunits,
+                        llvm::SmallVector<float, 32> &DensePressure) {
+  // Find the number of subunits with minimal pressure (they are at the
+  // front).
+  llvm::sort(Subunits.begin(), Subunits.end(),
+             [&DensePressure](const uint16_t A, const uint16_t B) {
+               return DensePressure[A] < DensePressure[B];
+             });
+  const auto getPressureForSubunit = [&DensePressure,
+                                      &Subunits](size_t I) -> float & {
+    return DensePressure[Subunits[I]];
+  };
+  size_t NumMinimalSU = 1;
+  while (NumMinimalSU < Subunits.size() &&
+         getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
+    ++NumMinimalSU;
+  }
+  while (RemainingPressure > 0.0f) {
+    if (NumMinimalSU == Subunits.size()) {
+      // All units are minimal, just distribute evenly and be done.
+      for (size_t I = 0; I < NumMinimalSU; ++I) {
+        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
+      }
+      return;
+    }
+    // Distribute the remaining pressure equally.
+    const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
+    const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
+    assert(MinimalPressure < SecondToMinimalPressure);
+    const float Increment = SecondToMinimalPressure - MinimalPressure;
+    if (RemainingPressure <= NumMinimalSU * Increment) {
+      // There is not enough remaining pressure.
+      for (size_t I = 0; I < NumMinimalSU; ++I) {
+        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
+      }
+      return;
+    }
+    // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
+    for (size_t I = 0; I < NumMinimalSU; ++I) {
+      getPressureForSubunit(I) = SecondToMinimalPressure;
+      RemainingPressure -= SecondToMinimalPressure;
+    }
+    while (NumMinimalSU < Subunits.size() &&
+           getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
+      ++NumMinimalSU;
+    }
+  }
+}
+
+std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
+    const llvm::MCSchedModel &SM,
+    llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS) {
+  // DensePressure[I] is the port pressure for Proc Resource I.
+  llvm::SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
+  llvm::sort(WPRS.begin(), WPRS.end(),
+             [](const llvm::MCWriteProcResEntry &A,
+                const llvm::MCWriteProcResEntry &B) {
+               return A.ProcResourceIdx < B.ProcResourceIdx;
+             });
+  for (const llvm::MCWriteProcResEntry &WPR : WPRS) {
+    // Get units for the entry.
+    const llvm::MCProcResourceDesc *const ProcResDesc =
+        SM.getProcResource(WPR.ProcResourceIdx);
+    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
+      // This is a ProcResUnit.
+      DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
+    } else {
+      // This is a ProcResGroup.
+      llvm::SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
+                                               ProcResDesc->SubUnitsIdxBegin +
+                                                   ProcResDesc->NumUnits);
+      distributePressure(WPR.Cycles, Subunits, DensePressure);
+    }
+  }
+  // Turn dense pressure into sparse pressure by removing zero entries.
+  std::vector<std::pair<uint16_t, float>> Pressure;
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    if (DensePressure[I] > 0.0f)
+      Pressure.emplace_back(I, DensePressure[I]);
+  }
+  return Pressure;
+}
+
 } // namespace exegesis
Index: tools/llvm-exegesis/lib/BenchmarkResult.h
===================================================================
--- tools/llvm-exegesis/lib/BenchmarkResult.h
+++ tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -77,6 +77,8 @@
   double min() const { return MinValue; }
   double max() const { return MaxValue; }
 
+  const std::string &key() const { return Key; }
+
 private:
   std::string Key;
   double SumValues = 0.0;
Index: tools/llvm-exegesis/lib/Clustering.h
===================================================================
--- tools/llvm-exegesis/lib/Clustering.h
+++ tools/llvm-exegesis/lib/Clustering.h
@@ -33,12 +33,10 @@
   public:
     static ClusterId noise() { return ClusterId(kNoise); }
     static ClusterId error() { return ClusterId(kError); }
-    static ClusterId makeValid(size_t Id) {
-      return ClusterId(Id);
-    }
+    static ClusterId makeValid(size_t Id) { return ClusterId(Id); }
     ClusterId() : Id_(kUndef) {}
     bool operator==(const ClusterId &O) const { return Id_ == O.Id_; }
-    bool operator<(const ClusterId &O) const {return Id_ < O.Id_; }
+    bool operator<(const ClusterId &O) const { return Id_ < O.Id_; }
 
     bool isValid() const { return Id_ <= kMaxValid; }
     bool isUndef() const { return Id_ == kUndef; }
@@ -53,7 +51,8 @@
 
   private:
     explicit ClusterId(size_t Id) : Id_(Id) {}
-    static constexpr const size_t kMaxValid = std::numeric_limits<size_t>::max() - 4;
+    static constexpr const size_t kMaxValid =
+        std::numeric_limits<size_t>::max() - 4;
     static constexpr const size_t kNoise = kMaxValid + 1;
     static constexpr const size_t kError = kMaxValid + 2;
     static constexpr const size_t kUndef = kMaxValid + 3;
@@ -88,12 +87,19 @@
 
   const std::vector<Cluster> &getValidClusters() const { return Clusters_; }
 
+  // Returns true if the given point is within a distance Epsilon of each other.
+  bool isNeighbour(const std::vector<BenchmarkMeasure> &P,
+                   const std::vector<BenchmarkMeasure> &Q) const;
+
 private:
   InstructionBenchmarkClustering(
-      const std::vector<InstructionBenchmark> &Points);
+      const std::vector<InstructionBenchmark> &Points, double EpsilonSquared);
   llvm::Error validateAndSetup();
-  void dbScan(size_t MinPts, double EpsilonSquared);
+  void dbScan(size_t MinPts);
+  std::vector<size_t> rangeQuery(size_t Q) const;
+
   const std::vector<InstructionBenchmark> &Points_;
+  const double EpsilonSquared_;
   int NumDimensions_ = 0;
   // ClusterForPoint_[P] is the cluster id for Points[P].
   std::vector<ClusterId> ClusterIdForPoint_;
Index: tools/llvm-exegesis/lib/Clustering.cpp
===================================================================
--- tools/llvm-exegesis/lib/Clustering.cpp
+++ tools/llvm-exegesis/lib/Clustering.cpp
@@ -29,38 +29,41 @@
 // [1] https://en.wikipedia.org/wiki/DBSCAN
 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm
 
-namespace {
-
 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
 // including Q).
-std::vector<size_t> rangeQuery(const std::vector<InstructionBenchmark> &Points,
-                               const size_t Q, const double EpsilonSquared) {
+std::vector<size_t>
+InstructionBenchmarkClustering::rangeQuery(const size_t Q) const {
   std::vector<size_t> Neighbors;
-  const auto &QMeasurements = Points[Q].Measurements;
-  for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) {
+  const auto &QMeasurements = Points_[Q].Measurements;
+  for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     if (P == Q)
       continue;
-    const auto &PMeasurements = Points[P].Measurements;
+    const auto &PMeasurements = Points_[P].Measurements;
     if (PMeasurements.empty()) // Error point.
       continue;
-    double DistanceSquared = 0;
-    for (size_t I = 0, E = QMeasurements.size(); I < E; ++I) {
-      const auto Diff = PMeasurements[I].Value - QMeasurements[I].Value;
-      DistanceSquared += Diff * Diff;
-    }
-    if (DistanceSquared <= EpsilonSquared) {
+    if (isNeighbour(PMeasurements, QMeasurements)) {
       Neighbors.push_back(P);
     }
   }
   return Neighbors;
 }
 
-} // namespace
+bool InstructionBenchmarkClustering::isNeighbour(
+    const std::vector<BenchmarkMeasure> &P,
+    const std::vector<BenchmarkMeasure> &Q) const {
+  double DistanceSquared = 0.0;
+  for (size_t I = 0, E = P.size(); I < E; ++I) {
+    const auto Diff = P[I].Value - Q[I].Value;
+    DistanceSquared += Diff * Diff;
+  }
+  return DistanceSquared <= EpsilonSquared_;
+}
 
 InstructionBenchmarkClustering::InstructionBenchmarkClustering(
-    const std::vector<InstructionBenchmark> &Points)
-    : Points_(Points), NoiseCluster_(ClusterId::noise()),
-      ErrorCluster_(ClusterId::error()) {}
+    const std::vector<InstructionBenchmark> &Points,
+    const double EpsilonSquared)
+    : Points_(Points), EpsilonSquared_(EpsilonSquared),
+      NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
 
 llvm::Error InstructionBenchmarkClustering::validateAndSetup() {
   ClusterIdForPoint_.resize(Points_.size());
@@ -97,12 +100,11 @@
   return llvm::Error::success();
 }
 
-void InstructionBenchmarkClustering::dbScan(const size_t MinPts,
-                                            const double EpsilonSquared) {
+void InstructionBenchmarkClustering::dbScan(const size_t MinPts) {
   for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) {
     if (!ClusterIdForPoint_[P].isUndef())
       continue; // Previously processed in inner loop.
-    const auto Neighbors = rangeQuery(Points_, P, EpsilonSquared);
+    const auto Neighbors = rangeQuery(P);
     if (Neighbors.size() + 1 < MinPts) { // Density check.
       // The region around P is not dense enough to create a new cluster, mark
       // as noise for now.
@@ -136,7 +138,7 @@
       ClusterIdForPoint_[Q] = CurrentCluster.Id;
       CurrentCluster.PointIndices.push_back(Q);
       // And extend to the neighbors of Q if the region is dense enough.
-      const auto Neighbors = rangeQuery(Points_, Q, EpsilonSquared);
+      const auto Neighbors = rangeQuery(Q);
       if (Neighbors.size() + 1 >= MinPts) {
         ToProcess.insert(Neighbors.begin(), Neighbors.end());
       }
@@ -155,7 +157,7 @@
 InstructionBenchmarkClustering::create(
     const std::vector<InstructionBenchmark> &Points, const size_t MinPts,
     const double Epsilon) {
-  InstructionBenchmarkClustering Clustering(Points);
+  InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon);
   if (auto Error = Clustering.validateAndSetup()) {
     return std::move(Error);
   }
@@ -163,7 +165,7 @@
     return Clustering; // Nothing to cluster.
   }
 
-  Clustering.dbScan(MinPts, Epsilon * Epsilon);
+  Clustering.dbScan(MinPts);
   return Clustering;
 }
 
Index: tools/llvm-mca/SummaryView.h
===================================================================
--- tools/llvm-mca/SummaryView.h
+++ tools/llvm-mca/SummaryView.h
@@ -45,10 +45,15 @@
   unsigned TotalCycles;
   // The total number of micro opcodes contributed by a block of instructions.
   unsigned NumMicroOps;
-  // For each processor resource, this map stores the cumulative number of
-  // resource cycles consumed by a block of instructions. The resource mask ID
-  // is used as the key value to access elements of this map.
-  llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage;
+  // For each processor resource, this vector stores the cumulative number of
+  // resource cycles consumed by the analyzed code block.
+  llvm::SmallVector<unsigned, 8> ProcResourceUsage;
+
+  // Each processor resource is associated with a so-called processor resource
+  // mask. This vector allows to correlate processor resource IDs with processor
+  // resource masks. There is exactly one element per each processor resource
+  // declared by the scheduling model.
+  llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
 
   // Compute the reciprocal throughput for the analyzed code block.
   // The reciprocal block throughput is computed as the MAX between:
@@ -58,9 +63,7 @@
 
 public:
   SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
-              unsigned Width)
-      : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-        NumMicroOps(0) {}
+              unsigned Width);
 
   void onCycleEnd() override { ++TotalCycles; }
 
Index: tools/llvm-mca/SummaryView.cpp
===================================================================
--- tools/llvm-mca/SummaryView.cpp
+++ tools/llvm-mca/SummaryView.cpp
@@ -24,6 +24,14 @@
 
 using namespace llvm;
 
+SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+                         unsigned Width)
+    : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
+      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
+      ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
+  computeProcResourceMasks(SM, ProcResourceMasks);
+}
+
 void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
   // We are only interested in the "instruction dispatched" events generated by
   // the dispatch stage for instructions that are part of iteration #0.
@@ -41,48 +49,14 @@
   const InstrDesc &Desc = Inst.getDesc();
   NumMicroOps += Desc.NumMicroOps;
   for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
-    if (!RU.second.size())
-      continue;
-
-    assert(RU.second.NumUnits && "Expected more than one unit used!");
-    if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) {
-      ProcResourceUsage[RU.first] = RU.second.size();
-      continue;
-    }
-
-    ProcResourceUsage[RU.first] += RU.second.size();
-  }
-}
-
-double SummaryView::getBlockRThroughput() const {
-  assert(NumMicroOps && "Expected at least one micro opcode!");
-
-  SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
-  computeProcResourceMasks(SM, Masks);
-
-  // The block throughput is bounded from above by the hardware dispatch
-  // throughput. That is because the DispatchWidth is an upper bound on the
-  // number of opcodes that can be part of a single dispatch group.
-  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
-
-  // The block throughput is also limited by the amount of hardware parallelism.
-  // The number of available resource units affects the resource pressure
-  // distributed, as well as how many blocks can be executed every cycle.
-  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
-    uint64_t Mask = Masks[I];
-    const auto It = ProcResourceUsage.find_as(Mask);
-    if (It != ProcResourceUsage.end()) {
-      const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
-      unsigned NumUnits = MCDesc.NumUnits;
-      double Throughput = static_cast<double>(It->second) / NumUnits;
-      Max = std::max(Max, Throughput);
+    if (RU.second.size()) {
+      const auto It = find(ProcResourceMasks, RU.first);
+      assert(It != ProcResourceMasks.end() &&
+             "Invalid processor resource mask!");
+      ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] +=
+          RU.second.size();
     }
   }
-
-  // The block reciprocal throughput is computed as the MAX of:
-  //  -  (#uOps / DispatchWidth)
-  //  -  (#units / resource cycles) for every consumed processor resource.
-  return Max;
 }
 
 void SummaryView::printView(raw_ostream &OS) const {
@@ -90,7 +64,8 @@
   unsigned Instructions = Source.size();
   unsigned TotalInstructions = Instructions * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
-  double BlockRThroughput = getBlockRThroughput();
+  double BlockRThroughput = computeBlockRThroughput(
+      SM, DispatchWidth, NumMicroOps, ProcResourceUsage);
 
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
Index: tools/llvm-mca/Support.h
===================================================================
--- tools/llvm-mca/Support.h
+++ tools/llvm-mca/Support.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H
 #define LLVM_TOOLS_LLVM_MCA_SUPPORT_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
@@ -44,6 +45,14 @@
 /// problems with simple bit manipulation operations.
 void computeProcResourceMasks(const llvm::MCSchedModel &SM,
                               llvm::SmallVectorImpl<uint64_t> &Masks);
+
+/// Compute the reciprocal block throughput from a set of processor resource
+/// cycles. The reciprocal block throughput is computed as the MAX between:
+///  - NumMicroOps / DispatchWidth
+///  - ProcResourceCycles / #ProcResourceUnits  (for every consumed resource).
+double computeBlockRThroughput(const llvm::MCSchedModel &SM,
+                               unsigned DispatchWidth, unsigned NumMicroOps,
+                               llvm::ArrayRef<unsigned> ProcResourceUsage);
 } // namespace mca
 
 #endif
Index: tools/llvm-mca/Support.cpp
===================================================================
--- tools/llvm-mca/Support.cpp
+++ tools/llvm-mca/Support.cpp
@@ -48,4 +48,32 @@
     ProcResourceID++;
   }
 }
+
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+                               unsigned NumMicroOps,
+                               ArrayRef<unsigned> ProcResourceUsage) {
+  // The block throughput is bounded from above by the hardware dispatch
+  // throughput. That is because the DispatchWidth is an upper bound on the
+  // number of opcodes that can be part of a single dispatch group.
+  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // The block throughput is also limited by the amount of hardware parallelism.
+  // The number of available resource units affects the resource pressure
+  // distribution, as well as how many blocks can be executed every cycle.
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    unsigned ResourceCycles = ProcResourceUsage[I];
+    if (!ResourceCycles)
+      continue;
+
+    const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+    double Throughput = static_cast<double>(ResourceCycles) / MCDesc.NumUnits;
+    Max = std::max(Max, Throughput);
+  }
+
+  // The block reciprocal throughput is computed as the MAX of:
+  //  - (NumMicroOps / DispatchWidth)
+  //  - (NumUnits / ResourceCycles)   for every consumed processor resource.
+  return Max;
+}
+
 } // namespace mca
Index: unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
===================================================================
--- /dev/null
+++ unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
@@ -0,0 +1,102 @@
+#include "Analysis.h"
+
+#include <cassert>
+#include <memory>
+
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace exegesis {
+namespace {
+
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+class AnalysisTest : public ::testing::Test {
+protected:
+  AnalysisTest() {
+    const std::string TT = "x86_64-unknown-linux";
+    std::string error;
+    const llvm::Target *const TheTarget =
+        llvm::TargetRegistry::lookupTarget(TT, error);
+    if (!TheTarget) {
+      llvm::errs() << error << "\n";
+      return;
+    }
+    STI.reset(TheTarget->createMCSubtargetInfo(TT, "haswell", ""));
+
+    // Compute the ProxResIdx of ports unes in tests.
+    const auto &SM = STI->getSchedModel();
+    for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+      const std::string Name = SM.getProcResource(I)->Name;
+      if (Name == "HWPort0") {
+        P0Idx = I;
+      } else if (Name == "HWPort1") {
+        P1Idx = I;
+      } else if (Name == "HWPort5") {
+        P5Idx = I;
+      } else if (Name == "HWPort6") {
+        P6Idx = I;
+      } else if (Name == "HWPort05") {
+        P05Idx = I;
+      } else if (Name == "HWPort0156") {
+        P0156Idx = I;
+      }
+    }
+    EXPECT_NE(P0Idx, 0);
+    EXPECT_NE(P1Idx, 0);
+    EXPECT_NE(P5Idx, 0);
+    EXPECT_NE(P6Idx, 0);
+    EXPECT_NE(P05Idx, 0);
+    EXPECT_NE(P0156Idx, 0);
+  }
+
+  static void SetUpTestCase() {
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86TargetMC();
+  }
+
+protected:
+  std::unique_ptr<const llvm::MCSubtargetInfo> STI;
+  uint16_t P0Idx = 0;
+  uint16_t P1Idx = 0;
+  uint16_t P5Idx = 0;
+  uint16_t P6Idx = 0;
+  uint16_t P05Idx = 0;
+  uint16_t P0156Idx = 0;
+};
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P0) {
+  const auto Pressure =
+      computeIdealizedProcResPressure(STI->getSchedModel(), {{P0Idx, 2}});
+  EXPECT_THAT(Pressure, UnorderedElementsAre(Pair(P0Idx, 2.0)));
+}
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P05) {
+  const auto Pressure =
+      computeIdealizedProcResPressure(STI->getSchedModel(), {{P05Idx, 2}});
+  EXPECT_THAT(Pressure,
+              UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P5Idx, 1.0)));
+}
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_2P05_2P0156) {
+  const auto Pressure = computeIdealizedProcResPressure(
+      STI->getSchedModel(), {{P05Idx, 2}, {P0156Idx, 2}});
+  EXPECT_THAT(Pressure,
+              UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P1Idx, 1.0),
+                                   Pair(P5Idx, 1.0), Pair(P6Idx, 1.0)));
+}
+
+TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1P1_1P05_2P0156) {
+  const auto Pressure = computeIdealizedProcResPressure(
+      STI->getSchedModel(), {{P1Idx, 1}, {P05Idx, 1}, {P0156Idx, 2}});
+  EXPECT_THAT(Pressure,
+              UnorderedElementsAre(Pair(P0Idx, 1.0), Pair(P1Idx, 1.0),
+                                   Pair(P5Idx, 1.0), Pair(P6Idx, 1.0)));
+}
+
+} // namespace
+} // namespace exegesis
Index: unittests/tools/llvm-exegesis/X86/CMakeLists.txt
===================================================================
--- unittests/tools/llvm-exegesis/X86/CMakeLists.txt
+++ unittests/tools/llvm-exegesis/X86/CMakeLists.txt
@@ -16,5 +16,6 @@
 add_llvm_unittest(LLVMExegesisX86Tests
   RegisterAliasingTest.cpp
   AssemblerTest.cpp
+  AnalysisTest.cpp
   )
 target_link_libraries(LLVMExegesisX86Tests PRIVATE LLVMExegesis)