Index: include/llvm/MC/MCSchedule.h
===================================================================
--- include/llvm/MC/MCSchedule.h
+++ include/llvm/MC/MCSchedule.h
@@ -24,9 +24,7 @@
 
 /// Define a kind of processor resource that will be modeled by the scheduler.
 struct MCProcResourceDesc {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   const char *Name;
-#endif
   unsigned NumUnits; // Number of resource of this kind
   unsigned SuperIdx; // Index of the resources kind that contains this kind.
 
Index: test/tools/llvm-mca/ARM/lit.local.cfg
===================================================================
--- test/tools/llvm-mca/ARM/lit.local.cfg
+++ test/tools/llvm-mca/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True
Index: test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
===================================================================
--- test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
+++ test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mca -mtriple=arm-eabi -mcpu=cortex-a9 -iterations=100 < %s | FileCheck %s
+
+vadd.f32 s0, s2, s2
+
+# CHECK: Iterations:     100
+# CHECK-NEXT: Instructions:   100
+# CHECK-NEXT: Total Cycles:   105
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC:            0.95
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - A9UnitAGU
+# CHECK-NEXT: [1.0] - A9UnitALU
+# CHECK-NEXT: [1.1] - A9UnitALU
+# CHECK-NEXT: [2] - A9UnitB
+# CHECK-NEXT: [3] - A9UnitFP
+# CHECK-NEXT: [4] - A9UnitLS
+# CHECK-NEXT: [5] - A9UnitMul
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]
+# CHECK-NEXT: 1.00    -      -      -     1.00    -      -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    	Instructions:
+# CHECK-NEXT: 1.00    -      -      -     1.00    -      -     	vadd.f32	s0, s2, s2
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1]    [2]    [3]    [4]    [5]    [6]	Instructions:
+# CHECK-NEXT:  1      4     1.00                    	vadd.f32	s0, s2, s2
Index: test/tools/llvm-mca/X86/cpus.s
===================================================================
--- test/tools/llvm-mca/X86/cpus.s
+++ test/tools/llvm-mca/X86/cpus.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2  < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1  < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge  < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=ivybridge  < %s | FileCheck --check-prefix=ALL --check-prefix=IVYBRIDGE %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=haswell  < %s | FileCheck --check-prefix=ALL --check-prefix=HASWELL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=broadwell  < %s | FileCheck --check-prefix=ALL --check-prefix=BROADWELL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=knl  < %s | FileCheck --check-prefix=ALL --check-prefix=KNL %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake  < %s | FileCheck --check-prefix=ALL --check-prefix=SKX %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512  < %s | FileCheck --check-prefix=ALL --check-prefix=SKX-AVX512 %s
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=slm  < %s | FileCheck --check-prefix=ALL --check-prefix=SLM %s
+
+add %edi, %eax
+
+# ALL: Iterations:     70
+# ALL-NEXT: Instructions:   70
+
+# BTVER2: Dispatch Width: 2
+# ZNVER1: Dispatch Width: 4
+# SANDYBRIDGE: Dispatch Width: 4
+# IVYBRIDGE: Dispatch Width: 4
+# HASWELL: Dispatch Width: 4
+# BROADWELL: Dispatch Width: 4
+# KNL: Dispatch Width: 4
+# SKX: Dispatch Width: 6
+# SKX-AVX512: Dispatch Width: 6
+# SLM: Dispatch Width: 2
+
Index: test/tools/llvm-mca/X86/default-iterations.s
===================================================================
--- test/tools/llvm-mca/X86/default-iterations.s
+++ test/tools/llvm-mca/X86/default-iterations.s
@@ -0,0 +1,11 @@
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s
+
+add %eax, %eax
+
+# DEFAULT: Iterations: 70
+# DEFAULT-NEXT: Instructions: 70
+
+# CUSTOM: Iterations: 1
+# CUSTOM-NEXT: Instructions: 1
Index: test/tools/llvm-mca/X86/dispatch_width.s
===================================================================
--- test/tools/llvm-mca/X86/dispatch_width.s
+++ test/tools/llvm-mca/X86/dispatch_width.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s
+
+add %eax, %eax
+
+# DEFAULT: Dispatch Width: 2
+# CUSTOM: Dispatch Width: 1
Index: test/tools/llvm-mca/X86/in-order-cpu.s
===================================================================
--- test/tools/llvm-mca/X86/in-order-cpu.s
+++ test/tools/llvm-mca/X86/in-order-cpu.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o %t1 2>&1 | FileCheck %s
+
+# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu.
Index: test/tools/llvm-mca/X86/invalid-assembly-sequence.s
===================================================================
--- test/tools/llvm-mca/X86/invalid-assembly-sequence.s
+++ test/tools/llvm-mca/X86/invalid-assembly-sequence.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s
+
+invalid_instruction_mnemonic
Index: test/tools/llvm-mca/X86/invalid-cpu.s
===================================================================
--- test/tools/llvm-mca/X86/invalid-cpu.s
+++ test/tools/llvm-mca/X86/invalid-cpu.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=foo -o %t1 2>&1 | FileCheck %s
+
+# CHECK: 'foo' is not a recognized processor for this target (ignoring processor)
Index: test/tools/llvm-mca/X86/invalid-empty-file.s
===================================================================
--- test/tools/llvm-mca/X86/invalid-empty-file.s
+++ test/tools/llvm-mca/X86/invalid-empty-file.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -o %t1 2>&1 | FileCheck %s
+
+# CHECK: error: no assembly instructions found.
Index: test/tools/llvm-mca/X86/lit.local.cfg
===================================================================
--- test/tools/llvm-mca/X86/lit.local.cfg
+++ test/tools/llvm-mca/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
Index: test/tools/llvm-mca/X86/no-sched-model.s
===================================================================
--- test/tools/llvm-mca/X86/no-sched-model.s
+++ test/tools/llvm-mca/X86/no-sched-model.s
@@ -0,0 +1,4 @@
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic < %s 2>&1 | FileCheck %s
+
+# CHECK: error: unable to find instruction-level scheduling information for target triple 'x86_64-unknown-unknown' and cpu 'generic'.
Index: test/tools/llvm-mca/X86/simple-test-btver2.s
===================================================================
--- test/tools/llvm-mca/X86/simple-test-btver2.s
+++ test/tools/llvm-mca/X86/simple-test-btver2.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s
+
+add %edi, %eax
+
+# CHECK: Iterations:     100
+# CHECK-NEXT: Instructions:   100
+# CHECK-NEXT: Total Cycles:   103
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC:            0.97
+
+# CHECK-LABEL: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]   [10]   [11]   [12]   [13]   	Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     -      -      -      -     	addl	%edi, %eax
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1]    [2]    [3]    [4]    [5]    [6]	Instructions:
+# CHECK-NEXT: 1      1     0.50                    	addl	%edi, %eax
Index: test/tools/llvm-mca/invalid_input_file_name.test
===================================================================
--- test/tools/llvm-mca/invalid_input_file_name.test
+++ test/tools/llvm-mca/invalid_input_file_name.test
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca %t.blah -o %t2 2>&1 | FileCheck --check-prefix=ENOENT %s
+
+# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory
Index: test/tools/llvm-mca/lit.local.cfg
===================================================================
--- test/tools/llvm-mca/lit.local.cfg
+++ test/tools/llvm-mca/lit.local.cfg
@@ -0,0 +1,4 @@
+# Requires a non-empty default triple for these tests
+if 'default_triple' not in config.available_features:
+    config.unsupported = True
+
Index: tools/LLVMBuild.txt
===================================================================
--- tools/LLVMBuild.txt
+++ tools/LLVMBuild.txt
@@ -37,6 +37,7 @@
  llvm-link
  llvm-lto
  llvm-mc
+ llvm-mca
  llvm-mcmarkup
  llvm-modextract
  llvm-mt
Index: tools/llvm-mca/Backend.h
===================================================================
--- tools/llvm-mca/Backend.h
+++ tools/llvm-mca/Backend.h
@@ -0,0 +1,141 @@
+//===--------------------- Backend.h ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an OoO backend for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKEND_H
+#define LLVM_TOOLS_LLVM_MCA_BACKEND_H
+
+#include "Dispatch.h"
+#include "InstrBuilder.h"
+#include "Scheduler.h"
+#include "SourceMgr.h"
+
+namespace mca {
+
+using namespace llvm;
+
+struct HWEventListener;
+
+/// \brief An out of order backend for a specific subtarget.
+///
+/// It emulates an out-of-order execution of instructions. Instructions are
+/// fetched from a MCInst sequence managed by an object of class SourceMgr.
+/// Instructions are firstly dispatched to the schedulers and then executed.
+/// This class tracks the lifetime of an instruction from the moment where
+/// it gets dispatched to the schedulers, to the moment where it finishes
+/// executing and register writes are architecturally committed.
+/// In particular, it monitors changes in the state of every instruction
+/// in flight.
+/// Instructions are executed in a loop of iterations. The number of iterations
+/// is defined by the SourceMgr object.
+/// The Backend entrypoint is method 'Run()' which execute cycles in a loop
+/// until there are new instructions to dispatch, and not every instruction
+/// has been retired.
+/// Internally, the Backend collects statistical information in the form of
+/// histograms. For example, it tracks how the dispatch group size changes
+/// over time.
+class Backend {
+  const MCSubtargetInfo &STI;
+
+  std::unique_ptr<InstrBuilder> IB;
+  std::unique_ptr<Scheduler> HWS;
+  std::unique_ptr<DispatchUnit> DU;
+  std::unique_ptr<SourceMgr> SM;
+  unsigned Cycles;
+
+  DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+  std::set<HWEventListener *> Listeners;
+
+  void RunCycle(unsigned Cycle);
+
+public:
+  Backend(const MCSubtargetInfo &Subtarget, const MCInstrInfo &MCII,
+          const MCRegisterInfo &MRI, std::unique_ptr<SourceMgr> Source,
+          unsigned DispatchWidth = 0, unsigned RegisterFileSize = 0,
+          unsigned MaxRetirePerCycle = 0, unsigned LoadQueueSize = 0,
+          unsigned StoreQueueSize = 0, bool AssumeNoAlias = false)
+      : STI(Subtarget),
+        HWS(llvm::make_unique<Scheduler>(this, Subtarget.getSchedModel(),
+                                         LoadQueueSize, StoreQueueSize,
+                                         AssumeNoAlias)),
+        DU(llvm::make_unique<DispatchUnit>(
+            this, MRI, Subtarget.getSchedModel().MicroOpBufferSize,
+            RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())),
+        SM(std::move(Source)), Cycles(0) {
+    IB = llvm::make_unique<InstrBuilder>(MCII, GetProcResourceMasks());
+  }
+
+  int Run() {
+    while (SM->HasNext() || !DU->IsRCUEmpty())
+      RunCycle(Cycles++);
+    return 0;
+  }
+
+  unsigned GetNumIterations() const { return SM->GetNumIterations(); }
+  unsigned GetNumInstructions() const { return SM->size(); }
+  unsigned GetNumCycles() const { return Cycles; }
+  unsigned GetTotalRegisterMappingsCreated() const {
+    return DU->GetTotalRegisterMappingsCreated();
+  }
+  unsigned GetMaxUsedRegisterMappings() const {
+    return DU->GetMaxUsedRegisterMappings();
+  }
+  unsigned GetDispatchWidth() const { return DU->GetDispatchWidth(); }
+
+  const MCSubtargetInfo &GetSTI() const { return STI; }
+  const MCSchedModel &GetSchedModel() const { return STI.getSchedModel(); }
+  const ArrayRef<uint64_t> GetProcResourceMasks() const {
+    return HWS->GetProcResourceMasks();
+  }
+
+  double GetRThroughput(const InstrDesc &ID) const {
+    return HWS->GetRThroughput(ID);
+  }
+  void GetBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+    return HWS->GetBuffersUsage(Usage);
+  }
+
+  unsigned GetNumRATStalls() const { return DU->GetNumRATStalls(); }
+  unsigned GetNumRCUStalls() const { return DU->GetNumRCUStalls(); }
+  unsigned GetNumSQStalls() const { return DU->GetNumSQStalls(); }
+  unsigned GetNumLDQStalls() const { return DU->GetNumLDQStalls(); }
+  unsigned GetNumSTQStalls() const { return DU->GetNumSTQStalls(); }
+  unsigned GetNumDispatchGroupStalls() const {
+    return DU->GetNumDispatchGroupStalls();
+  }
+
+  const MCInst &GetMCInstFromIndex(unsigned Index) const {
+    return SM->GetMCInstFromIndex(Index);
+  }
+
+  const InstrDesc &GetInstrDesc(const MCInst &Inst) const {
+    return IB->GetOrCreateInstrDesc(STI, Inst);
+  }
+
+  const SourceMgr &GetSourceMgr() const { return *SM; }
+
+  void AddEventListener(HWEventListener *Listener);
+  void NotifyCycleBegin(unsigned Cycle);
+  void NotifyInstructionDispatched(unsigned Index);
+  void NotifyInstructionReady(unsigned Index);
+  void NotifyInstructionIssued(
+      unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+  void NotifyInstructionExecuted(unsigned Index);
+  void NotifyResourceAvailable(const ResourceRef &RR);
+  void NotifyInstructionRetired(unsigned Index);
+  void NotifyCycleEnd(unsigned Cycle);
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/Backend.cpp
===================================================================
--- tools/llvm-mca/Backend.cpp
+++ tools/llvm-mca/Backend.cpp
@@ -0,0 +1,130 @@
+//===--------------------- Backend.cpp --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Implementation of class Backend which emulates an hardware OoO backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Backend.h"
+#include "HWEventListener.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+void Backend::AddEventListener(HWEventListener *Listener) {
+  if (Listener)
+    Listeners.insert(Listener);
+}
+
+void Backend::RunCycle(unsigned Cycle) {
+  NotifyCycleBegin(Cycle);
+
+  if (!SM->HasNext()) {
+    NotifyCycleEnd(Cycle);
+    return;
+  }
+
+  InstRef IR = SM->PeekNext();
+  const InstrDesc *Desc =
+      &IB->GetOrCreateInstrDesc(STI, *IR.second);
+  while (DU->IsAvailable(Desc->NumMicroOps) && DU->CanDispatch(*Desc)) {
+    Instruction *NewIS = IB->CreateInstruction(STI, *DU, IR.first, *IR.second);
+    Instructions[IR.first] = std::unique_ptr<Instruction>(NewIS);
+    NewIS->SetRCUTokenID(DU->Dispatch(IR.first, NewIS));
+
+    // If this is a zero latency instruction, then we don't need to dispatch
+    // it. Instead, we can mark it as executed.
+    if (NewIS->IsZeroLatency())
+      NotifyInstructionExecuted(IR.first);
+
+    // Check if we have dispatched all the instructions.
+    SM->UpdateNext();
+    if (!SM->HasNext())
+      break;
+
+    // Prepare for the next round.
+    IR = SM->PeekNext();
+    Desc = &IB->GetOrCreateInstrDesc(STI, *IR.second);
+  }
+
+  NotifyCycleEnd(Cycle);
+}
+
+void Backend::NotifyCycleBegin(unsigned Cycle) {
+  DEBUG(dbgs() << "[E] Cycle begin: " << Cycle << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnCycleBegin(Cycle);
+
+  DU->CycleEvent(Cycle);
+  HWS->CycleEvent(Cycle);
+}
+
+void Backend::NotifyInstructionDispatched(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Dispatched: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnInstructionDispatched(Index);
+}
+
+void Backend::NotifyInstructionReady(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Ready: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnInstructionReady(Index);
+}
+
+void Backend::NotifyInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  DEBUG(dbgs() << "[E] Instruction Issued: " << Index << '\n';
+        for (const auto &Resource
+             : Used) {
+          dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
+                 << Resource.first.second << "]\n";
+          dbgs() << "           cycles: " << Resource.second << '\n';
+        });
+
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnInstructionIssued(Index, Used);
+}
+
+void Backend::NotifyInstructionExecuted(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Executed: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnInstructionExecuted(Index);
+
+  const Instruction &IS = *Instructions[Index];
+  DU->OnInstructionExecuted(IS.GetRCUTokenID());
+}
+
+void Backend::NotifyInstructionRetired(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Retired: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnInstructionRetired(Index);
+
+  const Instruction &IS = *Instructions[Index];
+  DU->InvalidateRegisterMappings(IS);
+  Instructions.erase(Index);
+}
+
+void Backend::NotifyResourceAvailable(const ResourceRef &RR) {
+  DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.' << RR.second
+               << "]\n");
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnResourceAvailable(RR);
+}
+
+void Backend::NotifyCycleEnd(unsigned Cycle) {
+  DEBUG(dbgs() << "[E] Cycle end: " << Cycle << "\n\n");
+  for (HWEventListener *Listener : Listeners)
+    Listener->OnCycleEnd(Cycle);
+}
+
+} // namespace mca.
Index: tools/llvm-mca/BackendPrinter.h
===================================================================
--- tools/llvm-mca/BackendPrinter.h
+++ tools/llvm-mca/BackendPrinter.h
@@ -0,0 +1,104 @@
+//===--------------------- BackendPrinter.h ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements class BackendPrinter.
+/// BackendPrinter is able to collect statistics related to the code executed
+/// by the Backend class. Information is then printed out with the help of
+/// a MCInstPrinter (to pretty print MCInst objects) and other helper classes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+
+#include "Backend.h"
+#include "BackendStatistics.h"
+#include "ResourcePressureView.h"
+#include "TimelineView.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+class ResourcePressureView;
+class TimelineView;
+
+/// \brief A printer class that knows how to collects statistics on the
+/// code analyzed by the llvm-mca tool.
+///
+/// This class knows how to print out the analysis information collected
+/// during the execution of the code. Internally, it delegates to other
+/// classes the task of printing out timeline information as well as
+/// resource pressure.
+class BackendPrinter {
+  Backend &B;
+  bool EnableVerboseOutput;
+
+  std::unique_ptr<MCInstPrinter> MCIP;
+  std::unique_ptr<ToolOutputFile> File;
+
+  std::unique_ptr<ResourcePressureView> RPV;
+  std::unique_ptr<TimelineView> TV;
+  std::unique_ptr<BackendStatistics> BS;
+
+  using Histogram = std::map<unsigned, unsigned>;
+  void printDUStatistics(const Histogram &Stats, unsigned Cycles) const;
+  void printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                           unsigned SQStalls, unsigned LDQStalls,
+                           unsigned STQStalls, unsigned DGStalls) const;
+  void printRATStatistics(unsigned Mappings, unsigned MaxUsedMappings) const;
+  void printRCUStatistics(const Histogram &Histogram, unsigned Cycles) const;
+  void printIssuePerCycle(const Histogram &IssuePerCycle,
+                          unsigned TotalCycles) const;
+  void printSchedulerUsage(const MCSchedModel &SM,
+                           const ArrayRef<BufferUsageEntry> &Usage) const;
+  void printGeneralStatistics(unsigned Iterations, unsigned Cycles,
+                              unsigned Instructions,
+                              unsigned DispatchWidth) const;
+  void printInstructionInfo() const;
+
+  std::unique_ptr<ToolOutputFile> GetOutputStream(std::string OutputFile);
+  void initialize(std::string OputputFileName);
+
+public:
+  BackendPrinter(Backend &backend, std::string OutputFileName,
+                 std::unique_ptr<MCInstPrinter> IP, bool EnableVerbose)
+      : B(backend), EnableVerboseOutput(EnableVerbose), MCIP(std::move(IP)) {
+    initialize(OutputFileName);
+ }
+
+  ~BackendPrinter() {
+    if (File)
+      File->keep();
+  }
+
+  bool IsFileValid() const { return File.get(); }
+  raw_ostream &getOStream() const {
+    assert(IsFileValid());
+    return File->os();
+  }
+
+  MCInstPrinter &GetMCInstPrinter() const { return *MCIP; }
+
+  void AddResourcePressureView();
+  void AddTimelineView(unsigned MaxIterations = 3, unsigned MaxCycles = 80);
+
+  void print() const;
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/BackendPrinter.cpp
===================================================================
--- tools/llvm-mca/BackendPrinter.cpp
+++ tools/llvm-mca/BackendPrinter.cpp
@@ -0,0 +1,207 @@
+//===--------------------- BackendPrinter.cpp -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the BackendPrinter interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+namespace mca {
+
+std::unique_ptr<ToolOutputFile>
+BackendPrinter::GetOutputStream(std::string OutputFile) {
+  if (OutputFile == "")
+    OutputFile = "-";
+  std::error_code EC;
+  auto Out = llvm::make_unique<ToolOutputFile>(OutputFile, EC, sys::fs::F_None);
+  if (!EC)
+    return Out;
+  errs() << EC.message() << '\n';
+  return nullptr;
+}
+
+void BackendPrinter::printGeneralStatistics(unsigned Iterations,
+                                            unsigned Cycles,
+                                            unsigned Instructions,
+                                            unsigned DispatchWidth) const {
+  unsigned TotalInstructions = Instructions * Iterations;
+  double IPC = (double)TotalInstructions / Cycles;
+
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "Iterations:     " << Iterations;
+  TempStream << "\nInstructions:   " << TotalInstructions;
+  TempStream << "\nTotal Cycles:   " << Cycles;
+  TempStream << "\nDispatch Width: " << DispatchWidth;
+  TempStream << "\nIPC:            " << format("%.2f", IPC) << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printRATStatistics(unsigned TotalMappings,
+                                        unsigned MaxUsedMappings) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRegister Alias Table:";
+  TempStream << "\nTotal number of mappings created: " << TotalMappings;
+  TempStream << "\nMax number of mappings used:      " << MaxUsedMappings
+             << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                                         unsigned SCHEDQStalls,
+                                         unsigned LDQStalls, unsigned STQStalls,
+                                         unsigned DGStalls) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
+  TempStream << "RAT     - Register unavailable:                      "
+             << RATStalls;
+  TempStream << "\nRCU     - Retire tokens unavailable:                 "
+             << RCUStalls;
+  TempStream << "\nSCHEDQ  - Scheduler full:                            "
+             << SCHEDQStalls;
+  TempStream << "\nLQ      - Load queue full:                           "
+             << LDQStalls;
+  TempStream << "\nSQ      - Store queue full:                          "
+             << STQStalls;
+  TempStream << "\nGROUP   - Static restrictions on the dispatch group: "
+             << DGStalls;
+  TempStream << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printSchedulerUsage(
+    const MCSchedModel &SM, const ArrayRef<BufferUsageEntry> &Usage) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nScheduler's queue usage:\n";
+  const ArrayRef<uint64_t> ResourceMasks = B.GetProcResourceMasks();
+  for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(i);
+    if (!ProcResource.BufferSize)
+      continue;
+
+    for (const BufferUsageEntry &Entry : Usage)
+      if (ResourceMasks[i] == Entry.first)
+        TempStream << ProcResource.Name << ",  " << Entry.second << '/'
+                   << ProcResource.BufferSize << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printInstructionInfo() const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nInstruction Info:\n";
+  TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
+             << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects\n\n";
+
+  TempStream << "[1]    [2]    [3]    [4]    [5]    [6]\tInstructions:\n";
+  for (unsigned Idx = 0; Idx < B.GetNumInstructions(); ++Idx) {
+    const MCInst &Inst = B.GetMCInstFromIndex(Idx);
+    const InstrDesc &ID = B.GetInstrDesc(Inst);
+    unsigned NumMicroOpcodes = ID.NumMicroOps;
+    unsigned Latency = ID.MaxLatency;
+    double RThroughput = B.GetRThroughput(ID);
+    TempStream << ' ' << NumMicroOpcodes << "    ";
+    if (NumMicroOpcodes < 10)
+      TempStream << "  ";
+    else if (NumMicroOpcodes < 100)
+      TempStream << ' ';
+    TempStream << Latency << "   ";
+    if (Latency < 10.0)
+      TempStream << "  ";
+    else if (Latency < 100.0)
+      TempStream << ' ';
+    if (RThroughput) {
+      TempStream << format("%.2f", RThroughput) << ' ';
+      if (RThroughput < 10.0)
+        TempStream << "  ";
+      else if (RThroughput < 100.0)
+        TempStream << ' ';
+    } else {
+      TempStream << " -     ";
+    }
+    TempStream << (ID.MayLoad ? " *     " : "       ");
+    TempStream << (ID.MayStore ? " *     " : "       ");
+    TempStream << (ID.HasSideEffects ? " * " : "   ");
+    MCIP->printInst(&Inst, TempStream, "", B.GetSTI());
+    TempStream << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::print() const {
+  assert(IsFileValid());
+  unsigned Cycles = B.GetNumCycles();
+  printGeneralStatistics(B.GetNumIterations(), Cycles, B.GetNumInstructions(),
+                         B.GetDispatchWidth());
+  if (EnableVerboseOutput) {
+    printDispatchStalls(B.GetNumRATStalls(), B.GetNumRCUStalls(),
+                        B.GetNumSQStalls(), B.GetNumLDQStalls(),
+                        B.GetNumSTQStalls(), B.GetNumDispatchGroupStalls());
+    printRATStatistics(B.GetTotalRegisterMappingsCreated(),
+                       B.GetMaxUsedRegisterMappings());
+    BS->printHistograms(File->os());
+
+    std::vector<BufferUsageEntry> Usage;
+    B.GetBuffersUsage(Usage);
+    printSchedulerUsage(B.GetSchedModel(), Usage);
+  }
+
+  if (RPV) {
+    RPV->printResourcePressure(getOStream(), Cycles);
+    printInstructionInfo();
+  }
+
+  if (TV) {
+    TV->printTimeline(getOStream());
+    TV->printAverageWaitTimes(getOStream());
+  }
+}
+
+void BackendPrinter::AddResourcePressureView() {
+  if (!RPV) {
+    RPV = llvm::make_unique<ResourcePressureView>(
+        B.GetSTI(), *MCIP, B.GetSourceMgr(), B.GetProcResourceMasks());
+    B.AddEventListener(RPV.get());
+  }
+}
+
+void BackendPrinter::AddTimelineView(unsigned MaxIterations,
+                                     unsigned MaxCycles) {
+  if (!TV) {
+    TV = llvm::make_unique<TimelineView>(B.GetSTI(), *MCIP, B.GetSourceMgr(),
+                                         MaxIterations, MaxCycles);
+    B.AddEventListener(TV.get());
+  }
+}
+
+void BackendPrinter::initialize(std::string OutputFileName) {
+  File = std::move(GetOutputStream(OutputFileName));
+  MCIP->setPrintImmHex(false);
+  if (EnableVerboseOutput) {
+    BS = std::move(llvm::make_unique<BackendStatistics>());
+    B.AddEventListener(BS.get());
+  }
+}
+
+} // namespace mca.
Index: tools/llvm-mca/BackendStatistics.h
===================================================================
--- tools/llvm-mca/BackendStatistics.h
+++ tools/llvm-mca/BackendStatistics.h
@@ -0,0 +1,95 @@
+//===--------------------- BackendStatistics.h ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a printer class for printing generic Backend
+/// statistics related to the dispatch logic, scheduler and retire unit.
+///
+/// Example:
+/// ========
+///
+/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
+/// [# dispatched], [# cycles]
+///  0,              15  (11.5%)
+///  5,              4  (3.1%)
+///
+/// Schedulers - number of cycles where we saw N instructions issued:
+/// [# issued], [# cycles]
+///  0,          7  (5.4%)
+///  1,          4  (3.1%)
+///  2,          8  (6.2%)
+///
+/// Retire Control Unit - number of cycles where we saw N instructions retired:
+/// [# retired], [# cycles]
+///  0,           9  (6.9%)
+///  1,           6  (4.6%)
+///  2,           1  (0.8%)
+///  4,           3  (2.3%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+
+#include "HWEventListener.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace mca {
+
+class BackendStatistics : public HWEventListener {
+  using Histogram = std::map<unsigned, unsigned>;
+  Histogram DispatchGroupSizePerCycle;
+  Histogram RetiredPerCycle;
+  Histogram IssuedPerCycle;
+
+  unsigned NumDispatched;
+  unsigned NumIssued;
+  unsigned NumRetired;
+  unsigned NumCycles;
+
+  void updateHistograms() {
+    DispatchGroupSizePerCycle[NumDispatched]++;
+    IssuedPerCycle[NumIssued]++;
+    RetiredPerCycle[NumRetired]++;
+    NumDispatched = 0;
+    NumIssued = 0;
+    NumRetired = 0;
+  }
+
+  void printRetireUnitStatistics(llvm::raw_ostream &OS) const;
+  void printDispatchUnitStatistics(llvm::raw_ostream &OS) const;
+  void printSchedulerStatistics(llvm::raw_ostream &OS) const;
+
+public:
+  BackendStatistics() : NumDispatched(0), NumIssued(0), NumRetired(0) {}
+
+  void OnInstructionDispatched(unsigned Index) override { NumDispatched++; }
+  void
+  OnInstructionIssued(unsigned Index,
+                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>>
+                          & /* unused */) override {
+    NumIssued++;
+  }
+  void OnInstructionRetired(unsigned Index) override { NumRetired++; }
+
+  void OnCycleBegin(unsigned Cycle) override { NumCycles++; }
+
+  void OnCycleEnd(unsigned Cycle) override { updateHistograms(); }
+
+  void printHistograms(llvm::raw_ostream &OS) {
+    printDispatchUnitStatistics(OS);
+    printSchedulerStatistics(OS);
+    printRetireUnitStatistics(OS);
+  }
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/BackendStatistics.cpp
===================================================================
--- tools/llvm-mca/BackendStatistics.cpp
+++ tools/llvm-mca/BackendStatistics.cpp
@@ -0,0 +1,79 @@
+//===--------------------- BackendStatistics.cpp ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// 
+/// Functionalities used by the BackendPrinter to print out histograms
+/// related to number of {dispatch/issue/retire} per number of cycles.
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void BackendStatistics::printRetireUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRetire Control Unit - "
+             << "number of cycles where we saw N instructions retired:\n";
+  TempStream << "[# retired], [# cycles]\n";
+
+  for (const auto &Entry : RetiredPerCycle) {
+    TempStream << " " << Entry.first;
+    if (Entry.first < 10)
+      TempStream << ",           ";
+    else
+      TempStream << ",          ";
+    TempStream << Entry.second << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+void BackendStatistics::printDispatchUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDispatch Logic - "
+             << "number of cycles where we saw N instructions dispatched:\n";
+  TempStream << "[# dispatched], [# cycles]\n";
+  for (const auto &Entry : DispatchGroupSizePerCycle) {
+    TempStream << " " << Entry.first << ",              " << Entry.second
+               << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+void BackendStatistics::printSchedulerStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nSchedulers - number of cycles where we saw N instructions "
+                "issued:\n";
+  TempStream << "[# issued], [# cycles]\n";
+  for (const auto &Entry : IssuedPerCycle) {
+    TempStream << " " << Entry.first << ",          " << Entry.second << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+} // namespace mca
+
Index: tools/llvm-mca/CMakeLists.txt
===================================================================
--- tools/llvm-mca/CMakeLists.txt
+++ tools/llvm-mca/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_LINK_COMPONENTS
+  AllTargetsAsmPrinters
+  AllTargetsAsmParsers
+  AllTargetsDescs
+  AllTargetsDisassemblers
+  AllTargetsInfos
+  MC
+  MCParser
+  Support
+  )
+
+add_llvm_tool(llvm-mca
+  Backend.cpp
+  BackendPrinter.cpp
+  BackendStatistics.cpp
+  Dispatch.cpp
+  InstrBuilder.cpp
+  Instruction.cpp
+  LSUnit.cpp
+  llvm-mca.cpp
+  ResourcePressureView.cpp
+  Scheduler.cpp
+  TimelineView.cpp
+  )
Index: tools/llvm-mca/Dispatch.h
===================================================================
--- tools/llvm-mca/Dispatch.h
+++ tools/llvm-mca/Dispatch.h
@@ -0,0 +1,315 @@
+//===----------------------- Dispatch.h -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements classes that are used to model register files,
+/// reorder buffers and the hardware dispatch logic.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_H
+#define LLVM_TOOLS_LLVM_MCA_DISPATCH_H
+
+#include "Instruction.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <map>
+
+namespace mca {
+
+using namespace llvm;
+
+class WriteState;
+class DispatchUnit;
+class Scheduler;
+class Backend;
+
+/// \brief Keeps track of register definitions.
+///
+/// This class tracks register definitions, and performs register renaming
+/// to break anti dependencies.
+/// By default, there is no limit in the number of register aliases which
+/// can be created for the purpose of register renaming. However, users can
+/// specify at object construction time a limit in the number of temporary
+/// registers which can be used by the register renaming logic.
+class RegisterFile {
+  const MCRegisterInfo &MRI;
+  // Currently used mappings and maximum used mappings.
+  // These are to generate statistics only.
+  unsigned NumUsedMappings;
+  unsigned MaxUsedMappings;
+  // Total number of mappings created over time.
+  unsigned TotalMappingsCreated;
+
+  // The maximum number of register aliases which can be used by the
+  // register renamer. Defaut value for this field is zero.
+  // A value of zero for this field means that there is no limit in the
+  // amount of register mappings which can be created. That is equivalent
+  // to having a theoretically infinite number of temporary registers.
+  unsigned TotalMappings;
+
+  // This map contains an entry for every physical register.
+  // A register index is used as a key value to access a WriteState.
+  // This is how we track RAW dependencies for dispatched
+  // instructions. For every register, we track the last seen write only.
+  // This assumes that all writes fully update both super and sub registers.
+  // We need a flag in MCInstrDesc to check if a write also updates super
+  // registers. We can then have a extra tablegen flag to set for instructions.
+  // This is a separate patch on its own.
+  std::vector<WriteState *> RegisterMappings;
+  // Assumptions are:
+  //  a) a false dependencies is always removed by the register renamer.
+  //  b) the register renamer can create an "infinite" number of mappings.
+  // Since we track the number of mappings created, in future we may
+  // introduce constraints on the number of mappings that can be created.
+  // For example, the maximum number of registers that are available for
+  // register renaming purposes may default to the size of the register file.
+
+  // In future, we can extend this design to allow multiple register files, and
+  // apply different restrictions on the register mappings and the number of
+  // temporary registers used by mappings.
+
+public:
+  RegisterFile(const MCRegisterInfo &mri, unsigned Mappings = 0)
+      : MRI(mri), NumUsedMappings(0), MaxUsedMappings(0),
+        TotalMappingsCreated(0), TotalMappings(Mappings),
+        RegisterMappings(MRI.getNumRegs(), nullptr) {}
+
+  // Creates a new mapping for RegID.
+  void AddRegisterMapping(WriteState *WS);
+  void InvalidateRegisterMapping(const WriteState &WS);
+
+  bool IsAvailable(unsigned NumRegWrites);
+  void CollectWrites(SmallVector<WriteState *, 4> &Writes,
+                     unsigned RegID) const;
+  void UpdateOnRead(ReadState &RS, unsigned RegID);
+  unsigned GetMaxUsedRegisterMappings() const { return MaxUsedMappings; }
+  unsigned GetTotalRegisterMappingsCreated() const {
+    return TotalMappingsCreated;
+  }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief tracks which instructions are in-flight (i.e. dispatched but not
+/// retired) in the OoO backend.
+///
+/// This class checks on every cycle if/which instructions can be retired.
+/// Instructions are retired in program order.
+/// In the event of instruction retired, the DispatchUnit object that owns
+/// this RetireControlUnit gets notified.
+/// On instruction retired, register updates are all architecturally
+/// committed, and any temporary registers originally allocated for the
+/// retired instruction are freed.
+struct RetireControlUnit {
+  // A "token" (object of class RUToken) is created by the retire unit for every
+  // instruction dispatched to the schedulers.  Flag 'Executed' is used to
+  // quickly check if an instruction has reached the write-back stage.  A token
+  // also carries information related to the number of entries consumed by the
+  // instruction in the reorder buffer. The idea is that those entries will
+  // become available again once the instruction is retired.  On every cycle,
+  // the RCU (Retire Control Unit) scans every token starting to search for
+  // instructions that are ready to retire.  retired. Instructions are retired
+  // in program order. Only 'Executed' instructions are eligible for retire.
+  // Note that the size of the reorder buffer is defined by the scheduling model
+  // via field 'NumMicroOpBufferSize'.
+  struct RUToken {
+    unsigned Index;    // Instruction index.
+    unsigned NumSlots; // Slots reserved to this instruction.
+    bool Executed;     // True if the instruction is past the WB stage.
+  };
+
+private:
+  unsigned NextAvailableSlotIdx;
+  unsigned CurrentInstructionSlotIdx;
+  unsigned AvailableSlots;
+  unsigned MaxRetirePerCycle; // 0 means no limit.
+  std::vector<RUToken> Queue;
+  DispatchUnit *Owner;
+
+public:
+  RetireControlUnit(unsigned NumSlots, unsigned RPC, DispatchUnit *DU)
+      : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
+        AvailableSlots(NumSlots), MaxRetirePerCycle(RPC), Owner(DU) {
+    assert(NumSlots && "Expected at least one slot!");
+    Queue.resize(NumSlots);
+  }
+
+  bool IsFull() const { return !AvailableSlots; }
+  bool IsEmpty() const { return AvailableSlots == Queue.size(); }
+  bool IsAvailable(unsigned Quantity = 1) const {
+    // Some instructions may declare a number of uOps which exceedes the size
+    // of the reorder buffer. To avoid problems, cap the amount of slots to
+    // the size of the reorder buffer.
+    Quantity = std::min(Quantity, static_cast<unsigned>(Queue.size()));
+    return AvailableSlots >= Quantity;
+  }
+
+  // Reserves a number of slots, and returns a new token.
+  unsigned ReserveSlot(unsigned Index, unsigned NumMicroOps);
+  // Returns the number of instructions retired in this cycle.
+  void CycleEvent();
+
+  void OnInstructionExecuted(unsigned TokenID);
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+// \brief Implements the hardware dispatch logic.
+//
+// This class is responsible for the dispatch stage, in which instructions are
+// dispatched in groups to the Scheduler.  An instruction can be dispatched if
+// functional units are available.
+// To be more specific, an instruction can be dispatched to the Scheduler if:
+//  1) There are enough entries in the reorder buffer (implemented by class
+//     RetireControlUnit) to accomodate all opcodes.
+//  2) There are enough temporaries to rename output register operands.
+//  3) There are enough entries available in the used buffered resource(s).
+//
+// The number of micro opcodes that can be dispatched in one cycle is limited by
+// the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs when
+// processor resources are not available (i.e. at least one of the
+// abovementioned checks fails). Dispatch stall events are counted during the
+// entire execution of the code, and displayed by the performance report when
+// flag '-verbose' is specified.
+//
+// If the number of micro opcodes of an instruction is bigger than
+// DispatchWidth, then it can only be dispatched at the beginning of one cycle.
+// The DispatchUnit will still have to wait for a number of cycles (depending on
+// the DispatchWidth and the number of micro opcodes) before it can serve other
+// instructions.
+class DispatchUnit {
+  unsigned DispatchWidth;
+  unsigned AvailableEntries;
+  unsigned CarryOver;
+  Scheduler *SC;
+
+  std::unique_ptr<RegisterFile> RAT;
+  std::unique_ptr<RetireControlUnit> RCU;
+  Backend *Owner;
+
+  /// Dispatch stall event identifiers.
+  ///
+  /// The naming convention is:
+  /// * Each event name starts with the "DS_" prefix
+  /// * For dynamic dispatch stalls, the "DS_" prefix is always followed by the
+  ///   name of the unavailable resource/functional unit (example: RAT)
+  /// * The last substring captures the reason why the event occurred (example:
+  ///   REG_UNAVAILABLE means that register renaming couldn't find enough spare
+  ///   temporary registers in the register file).
+  ///
+  /// List of acronyms used for processor resoures:
+  /// RAT - Register Alias Table (used by the register renaming logic)
+  /// RCU - Retire Control Unit
+  /// SQ  - Scheduler's Queue
+  /// LDQ - Load Queue
+  /// STQ - Store Queue
+  enum {
+    DS_RAT_REG_UNAVAILABLE,
+    DS_RCU_TOKEN_UNAVAILABLE,
+    DS_SQ_TOKEN_UNAVAILABLE,
+    DS_LDQ_TOKEN_UNAVAILABLE,
+    DS_STQ_TOKEN_UNAVAILABLE,
+    DS_DISPATCH_GROUP_RESTRICTION,
+    DS_LAST
+  };
+
+  // The DispatchUnit track dispatch stall events caused by unavailable
+  // of hardware resources. Events are classified based on the stall kind;
+  // so we have a counter for every source of dispatch stall. Counters are
+  // stored into a vector `DispatchStall` which is always of size DS_LAST.
+  std::vector<unsigned> DispatchStalls;
+
+  bool CheckRAT(const InstrDesc &Desc);
+  bool CheckRCU(const InstrDesc &Desc);
+  bool CheckScheduler(const InstrDesc &Desc);
+
+  void NotifyInstructionDispatched(unsigned IID);
+
+public:
+  DispatchUnit(Backend *B, const MCRegisterInfo &MRI,
+               unsigned MicroOpBufferSize, unsigned RegisterFileSize,
+               unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth,
+               Scheduler *Sched)
+      : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
+        CarryOver(0U), SC(Sched),
+        RAT(llvm::make_unique<RegisterFile>(MRI, RegisterFileSize)),
+        RCU(llvm::make_unique<RetireControlUnit>(MicroOpBufferSize,
+                                                 MaxRetirePerCycle, this)),
+        Owner(B), DispatchStalls(DS_LAST, 0) {}
+
+  unsigned GetDispatchWidth() const { return DispatchWidth; }
+
+  bool IsAvailable(unsigned NumEntries) const {
+    return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth;
+  }
+
+  bool IsRCUEmpty() const { return RCU->IsEmpty(); }
+
+  bool CanDispatch(const InstrDesc &Desc) {
+    assert(IsAvailable(Desc.NumMicroOps));
+    return CheckRCU(Desc) && CheckRAT(Desc) && CheckScheduler(Desc);
+  }
+
+  unsigned Dispatch(unsigned IID, Instruction *NewInst);
+
+  void CollectWrites(SmallVector<WriteState *, 4> &Vec, unsigned RegID) const {
+    return RAT->CollectWrites(Vec, RegID);
+  }
+  unsigned GetNumRATStalls() const {
+    return DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  }
+  unsigned GetNumRCUStalls() const {
+    return DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  }
+  unsigned GetNumSQStalls() const {
+    return DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned GetNumLDQStalls() const {
+    return DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned GetNumSTQStalls() const {
+    return DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned GetNumDispatchGroupStalls() const {
+    return DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
+  }
+  unsigned GetMaxUsedRegisterMappings() const {
+    return RAT->GetMaxUsedRegisterMappings();
+  }
+  unsigned GetTotalRegisterMappingsCreated() const {
+    return RAT->GetTotalRegisterMappingsCreated();
+  }
+  void AddNewRegisterMapping(WriteState *WS) { RAT->AddRegisterMapping(WS); }
+
+  void CycleEvent(unsigned Cycle) {
+    RCU->CycleEvent();
+    AvailableEntries =
+        CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
+    CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+  }
+
+  void NotifyInstructionRetired(unsigned Index);
+
+  void OnInstructionExecuted(unsigned TokenID) {
+    return RCU->OnInstructionExecuted(TokenID);
+  }
+
+  void InvalidateRegisterMappings(const Instruction &Inst);
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/Dispatch.cpp
===================================================================
--- tools/llvm-mca/Dispatch.cpp
+++ tools/llvm-mca/Dispatch.cpp
@@ -0,0 +1,267 @@
+//===--------------------- Dispatch.cpp -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods declared by class RegisterFile, DispatchUnit and
+/// RetireControlUnit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Dispatch.h"
+#include "Scheduler.h"
+#include "Backend.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+// Creates a new mapping for RegID.
+void RegisterFile::AddRegisterMapping(WriteState *WS) {
+  unsigned RegID = WS->GetRegisterID();
+  assert(RegID && "Adding an invalid register definition?");
+
+  RegisterMappings[RegID] = WS;
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = WS;
+  if (MaxUsedMappings == NumUsedMappings)
+    MaxUsedMappings++;
+  NumUsedMappings++;
+  TotalMappingsCreated++;
+  // If this is a partial update, then we are done.
+  if (!WS->FullyUpdatesSuperRegs())
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = WS;
+}
+
+void RegisterFile::InvalidateRegisterMapping(const WriteState &WS) {
+  unsigned RegID = WS.GetRegisterID();
+  bool ShouldInvalidateSuperRegs = WS.FullyUpdatesSuperRegs();
+
+  assert(RegID != 0 && "Invalidating an already invalid register?");
+  assert(WS.GetCyclesLeft() != -512 &&
+         "Invalidating a write of unknown cycles!");
+  assert(WS.GetCyclesLeft() <= 0 && "Invalid cycles left for this write!");
+  if (!RegisterMappings[RegID])
+    return;
+
+  assert(NumUsedMappings);
+  NumUsedMappings--;
+
+  if (RegisterMappings[RegID] == &WS)
+    RegisterMappings[RegID] = nullptr;
+
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+
+  if (!ShouldInvalidateSuperRegs)
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+}
+
+// Update the number of used mappings in the event of instruction retired.
+void DispatchUnit::InvalidateRegisterMappings(const Instruction &IS) {
+  for (const auto &WS : IS.GetDefs()) {
+    DEBUG(dbgs() << "[RAT] Invalidating mapping for: ");
+    DEBUG(WS->dump());
+    RAT->InvalidateRegisterMapping(*WS.get());
+  }
+}
+
+void RegisterFile::CollectWrites(SmallVector<WriteState *, 4> &Writes,
+                                 unsigned RegID) const {
+  assert(RegID && RegID < RegisterMappings.size());
+  WriteState *WS = RegisterMappings[RegID];
+  if (WS) {
+    DEBUG(dbgs() << "Found a dependent use for RegID=" << RegID << '\n');
+    Writes.push_back(WS);
+  }
+
+  // Handle potential partial register updates.
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    WS = RegisterMappings[*I];
+    if (WS && std::find(Writes.begin(), Writes.end(), WS) == Writes.end()) {
+      DEBUG(dbgs() << "Found a dependent use of subReg " << *I << " (part of "
+                   << RegID << ")\n");
+      Writes.push_back(WS);
+    }
+  }
+}
+
+bool RegisterFile::IsAvailable(unsigned NumRegWrites) {
+  if (!TotalMappings)
+    return true;
+  if (NumRegWrites > TotalMappings) {
+    // The user specified a too small number of registers.
+    // Artificially set the number of temporaries to NumRegWrites.
+    errs() << "warning: not enough temporaries in the register file. "
+           << "The register file size has been automatically increased to "
+           << NumRegWrites << '\n';
+    TotalMappings = NumRegWrites;
+  }
+
+  return NumRegWrites + NumUsedMappings <= TotalMappings;
+}
+
+#ifndef NDEBUG
+void RegisterFile::dump() const {
+  for (unsigned i = 0; i < MRI.getNumRegs(); ++i)
+    if (RegisterMappings[i]) {
+      dbgs() << MRI.getName(i) << ", " << i << ", ";
+      RegisterMappings[i]->dump();
+    }
+
+  dbgs() << "TotalMappingsCreated: " << TotalMappingsCreated
+         << ", MaxUsedMappings: " << MaxUsedMappings
+         << ", NumUsedMappings: " << NumUsedMappings << '\n';
+}
+#endif
+
+// Reserves a number of slots, and returns a new token.
+unsigned RetireControlUnit::ReserveSlot(unsigned Index, unsigned NumMicroOps) {
+  assert(IsAvailable(NumMicroOps));
+  unsigned NormalizedQuantity =
+      std::min(NumMicroOps, static_cast<unsigned>(Queue.size()));
+  // Zero latency instructions may have zero mOps. Artificially bump this
+  // value to 1. Although zero latency instructions don't consume scheduler
+  // resources, they still consume one slot in the retire queue.
+  NormalizedQuantity = std::max(NormalizedQuantity, 1U);
+  unsigned TokenID = NextAvailableSlotIdx;
+  Queue[NextAvailableSlotIdx] = {Index, NormalizedQuantity, false};
+  NextAvailableSlotIdx += NormalizedQuantity;
+  NextAvailableSlotIdx %= Queue.size();
+  AvailableSlots -= NormalizedQuantity;
+  return TokenID;
+}
+
+void DispatchUnit::NotifyInstructionDispatched(unsigned Index) {
+  Owner->NotifyInstructionDispatched(Index);
+}
+
+void DispatchUnit::NotifyInstructionRetired(unsigned Index) {
+  Owner->NotifyInstructionRetired(Index);
+}
+
+void RetireControlUnit::CycleEvent() {
+  if (IsEmpty())
+    return;
+
+  unsigned NumRetired = 0;
+  while (!IsEmpty()) {
+    if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
+      break;
+    RUToken &Current = Queue[CurrentInstructionSlotIdx];
+    assert(Current.NumSlots && "Reserved zero slots?");
+    if (!Current.Executed)
+      break;
+    Owner->NotifyInstructionRetired(Current.Index);
+    CurrentInstructionSlotIdx += Current.NumSlots;
+    CurrentInstructionSlotIdx %= Queue.size();
+    AvailableSlots += Current.NumSlots;
+    NumRetired++;
+  }
+}
+
+void RetireControlUnit::OnInstructionExecuted(unsigned TokenID) {
+  assert(Queue.size() > TokenID);
+  assert(Queue[TokenID].Executed == false && Queue[TokenID].Index != ~0U);
+  Queue[TokenID].Executed = true;
+}
+
+#ifndef NDEBUG
+void RetireControlUnit::dump() const {
+  dbgs() << "Retire Unit: { Total Slots=" << Queue.size()
+         << ", Available Slots=" << AvailableSlots << " }\n";
+}
+#endif
+
+bool DispatchUnit::CheckRAT(const InstrDesc &Desc) {
+  unsigned NumWrites = Desc.Writes.size();
+  if (RAT->IsAvailable(NumWrites))
+    return true;
+  DispatchStalls[DS_RAT_REG_UNAVAILABLE]++;
+  return false;
+}
+
+bool DispatchUnit::CheckRCU(const InstrDesc &Desc) {
+  unsigned NumMicroOps = Desc.NumMicroOps;
+  if (RCU->IsAvailable(NumMicroOps))
+    return true;
+  DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]++;
+  return false;
+}
+
+bool DispatchUnit::CheckScheduler(const InstrDesc &Desc) {
+  // If this is a zero-latency instruction, then it bypasses
+  // the scheduler.
+  switch (SC->CanBeDispatched(Desc)) {
+  case Scheduler::HWS_AVAILABLE:
+    return true;
+  case Scheduler::HWS_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_LD_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_ST_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_DISPATCH_GROUP_RESTRICTION:
+    DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION]++;
+  }
+
+  return false;
+}
+
+unsigned DispatchUnit::Dispatch(unsigned IID, Instruction *NewInst) {
+  assert(!CarryOver && "Cannot dispatch another instruction!");
+  unsigned NumMicroOps = NewInst->GetDesc().NumMicroOps;
+  if (NumMicroOps > DispatchWidth) {
+    assert(AvailableEntries == DispatchWidth);
+    AvailableEntries = 0;
+    CarryOver = NumMicroOps - DispatchWidth;
+  } else {
+    assert(AvailableEntries >= NumMicroOps);
+    AvailableEntries -= NumMicroOps;
+  }
+
+  // Reserve slots in the RCU.
+  unsigned RCUTokenID = RCU->ReserveSlot(IID, NumMicroOps);
+  Owner->NotifyInstructionDispatched(IID);
+
+  SC->ScheduleInstruction(IID, NewInst);
+  return RCUTokenID;
+}
+
+#ifndef NDEBUG
+void DispatchUnit::dump() const {
+  RAT->dump();
+  RCU->dump();
+
+  unsigned DSRAT = DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  unsigned DSRCU = DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  unsigned DSSCHEDQ = DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  unsigned DSLQ = DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  unsigned DSSQ = DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+
+  dbgs() << "STALLS --- RAT: " << DSRAT << ", RCU: " << DSRCU
+         << ", SCHED_QUEUE: " << DSSCHEDQ << ", LOAD_QUEUE: " << DSLQ
+         << ", STORE_QUEUE: " << DSSQ << '\n';
+}
+#endif
+
+} // namespace mca
Index: tools/llvm-mca/HWEventListener.h
===================================================================
--- tools/llvm-mca/HWEventListener.h
+++ tools/llvm-mca/HWEventListener.h
@@ -0,0 +1,34 @@
+#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include <utility>
+
+namespace mca {
+
+using namespace llvm;
+
+struct HWEventListener {
+  // Events generated by the Retire Control Unit.
+  virtual void OnInstructionRetired(unsigned Index){};
+
+  // Events generated by the Scheduler.
+  using ResourceRef = std::pair<uint64_t, uint64_t>;
+  virtual void
+  OnInstructionIssued(unsigned Index,
+                      const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {}
+  virtual void OnInstructionExecuted(unsigned Index) {}
+  virtual void OnInstructionReady(unsigned Index) {}
+  virtual void OnResourceAvailable(const ResourceRef &RRef){};
+
+  // Events generated by the Dispatch logic.
+  virtual void OnInstructionDispatched(unsigned Index) {}
+
+  // Generic events generated by the Backend.
+  virtual void OnCycleBegin(unsigned Cycle) {}
+  virtual void OnCycleEnd(unsigned Cycle) {}
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/InstrBuilder.h
===================================================================
--- tools/llvm-mca/InstrBuilder.h
+++ tools/llvm-mca/InstrBuilder.h
@@ -0,0 +1,62 @@
+//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A builder class for instructions that are statically analyzed by llvm-mca.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
+
+#include "Dispatch.h"
+#include "Instruction.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+
+namespace mca {
+
+using namespace llvm;
+
+class DispatchUnit;
+
+/// \brief A builder class that knows how to construct Instruction objects.
+///
+/// Every llvm-mca Instruction is described by an object of class InstrDesc.
+/// An InstrDesc describes which registers are read/written by the instruction,
+/// as well as the instruction latency and hardware resources consumed.
+///
+/// This class is used by the tool to construct Instructions and instruction
+/// descriptors (i.e. InstrDesc objects).
+/// Information from the machine scheduling model is used to identify processor
+/// resources that are consumed by an instruction.
+class InstrBuilder {
+  const MCInstrInfo &MCII;
+  const ArrayRef<uint64_t> ProcResourceMasks;
+
+  DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+  DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+
+  void CreateInstrDescImpl(const MCSubtargetInfo &STI, const MCInst &MCI);
+
+public:
+  InstrBuilder(const MCInstrInfo &mcii, const ArrayRef<uint64_t> Masks)
+      : MCII(mcii), ProcResourceMasks(Masks) {}
+
+  const InstrDesc &GetOrCreateInstrDesc(const MCSubtargetInfo &STI,
+                                        const MCInst &MCI);
+
+  Instruction *CreateInstruction(const MCSubtargetInfo &STI, DispatchUnit &DU,
+                                 unsigned Idx, const MCInst &MCI);
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/InstrBuilder.cpp
===================================================================
--- tools/llvm-mca/InstrBuilder.cpp
+++ tools/llvm-mca/InstrBuilder.cpp
@@ -0,0 +1,530 @@
+//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstrBuilder interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstrBuilder.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+static void
+initializeUsedResources(InstrDesc &ID, const MCSchedClassDesc &SCDesc,
+                        const MCSubtargetInfo &STI,
+                        const ArrayRef<uint64_t> ProcResourceMasks) {
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Populate resources consumed.
+  using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
+  std::vector<ResourcePlusCycles> Worklist;
+  for (unsigned i = 0; i < SCDesc.NumWriteProcResEntries; ++i) {
+    const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + i;
+    const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
+    uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
+    if (PR.BufferSize != -1)
+      ID.Buffers.push_back(Mask);
+    CycleSegment RCy(0, PRE->Cycles, false);
+    Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
+  }
+
+  // Sort elements by mask popcount, so that we prioritize resource units over
+  // resource groups, and smaller groups over larger groups.
+  std::sort(Worklist.begin(), Worklist.end(),
+            [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
+              unsigned popcntA = countPopulation(A.first);
+              unsigned popcntB = countPopulation(B.first);
+              if (popcntA < popcntB)
+                return true;
+              if (popcntA > popcntB)
+                return false;
+              return A.first < B.first;
+            });
+
+  uint64_t UsedResourceUnits = 0;
+
+  // Remove cycles contributed by smaller resources.
+  for (unsigned i = 0; i < Worklist.size(); ++i) {
+    ResourcePlusCycles &A = Worklist[i];
+    if (!A.second.size()) {
+      A.second.NumUnits = 0;
+      A.second.SetReserved();
+      ID.Resources.emplace_back(A);
+      continue;
+    }
+
+    ID.Resources.emplace_back(A);
+    uint64_t NormalizedMask = A.first;
+    if (countPopulation(A.first) == 1) {
+      UsedResourceUnits |= A.first;
+    } else {
+      // Remove the leading 1 from the resource group mask.
+      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+    }
+
+    for (unsigned j = i + 1; j < Worklist.size(); ++j) {
+      ResourcePlusCycles &B = Worklist[j];
+      if ((NormalizedMask & B.first) == NormalizedMask) {
+        B.second.CS.Subtract(A.second.size());
+        if (countPopulation(B.first) > 1)
+          B.second.NumUnits++;
+      }
+    }
+  }
+
+  // A SchedWrite may specify a number of cycles in which a resource group
+  // is reserved. For example (on target x86; cpu Haswell):
+  //
+  //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
+  //    let ResourceCycles = [2, 2, 3];
+  //  }
+  //
+  // This means:
+  // Resource units HWPort0 and HWPort1 are both used for 2cy.
+  // Resource group HWPort01 is the union of HWPort0 and HWPort1.
+  // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
+  // will not be usable for 2 entire cycles from instruction issue.
+  //
+  // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
+  // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
+  // extra delay on top of the 2 cycles latency.
+  // During those extra cycles, HWPort01 is not usable by other instructions.
+  for (ResourcePlusCycles &RPC : ID.Resources) {
+    if (countPopulation(RPC.first) > 1 && !RPC.second.IsReserved()) {
+      // Remove the leading 1 from the resource group mask.
+      uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
+      if ((Mask & UsedResourceUnits) == Mask)
+        RPC.second.SetReserved();
+    }
+  }
+
+  DEBUG(for (const auto &R
+             : ID.Resources) dbgs()
+            << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
+        for (const auto &R
+             : ID.Buffers) dbgs()
+        << "\t\tBuffer Mask=" << R << '\n';);
+}
+
+static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
+                              const MCSchedClassDesc &SCDesc,
+                              const MCSubtargetInfo &STI) {
+  unsigned MaxLatency = 0;
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  for (unsigned i = 0; i < NumWriteLatencyEntries; ++i) {
+    int Cycles = STI.getWriteLatencyEntry(&SCDesc, i)->Cycles;
+    // Check if this is an unknown latency. Conservatively (pessimistically)
+    // assume a latency of 100cy if late.
+    if (Cycles == -1)
+      Cycles = 100;
+    MaxLatency = std::max(MaxLatency, static_cast<unsigned>(Cycles));
+  }
+
+  if (MCDesc.isCall()) {
+    // We cannot estimate how long this call will take.
+    // Artificially set an arbitrarily high latency (100cy).
+    MaxLatency = std::max(100U, MaxLatency);
+  }
+
+  // Check if this instruction consumes any processor resources.
+  // If the latency is unknown, then conservatively set it equal to the maximum
+  // number of cycles for a resource consumed by this instruction.
+  if (!MaxLatency && ID.Resources.size()) {
+    // Check if this instruction consumes any processor resources.
+    // If so, then compute the max of the cycles spent for each resource, and
+    // use it as the MaxLatency.
+    for (const auto &Resource : ID.Resources)
+      MaxLatency = std::max(MaxLatency, Resource.second.size());
+  }
+
+  if (SCDesc.isVariant() && MaxLatency == 0) {
+    errs() << "note: unknown latency for a variant opcode. Conservatively"
+           << " assume a default latency of 1cy.\n";
+    MaxLatency = 1;
+  }
+
+  ID.MaxLatency = MaxLatency;
+}
+
+static void populateWrites(InstrDesc &ID, const MCInst &MCI,
+                           const MCInstrDesc &MCDesc,
+                           const MCSchedClassDesc &SCDesc,
+                           const MCSubtargetInfo &STI) {
+  computeMaxLatency(ID, MCDesc, SCDesc, STI);
+
+  // Set if writes through this opcode may update super registers.
+  // TODO: on x86-64, a 4 byte write of a general purpose register always
+  // fully updates the super-register.
+  // More in general, (at least on x86) not all register writes perform
+  // a partial (super-)register update.
+  // For example, an AVX instruction that writes on a XMM register implicitly
+  // zeroes the upper half of every aliasing super-register.
+  //
+  // For now, we pessimistically assume that writes are all potentially
+  // partial register updates. This is a good default for most targets, execept
+  // for those like x86 which implement a special semantic for certain opcodes.
+  // At least on x86, this may lead to an inaccurate prediction of the
+  // instruction level parallelism.
+  bool FullyUpdatesSuperRegisters = false;
+
+  // Now Populate Writes.
+
+  // This algorithm currently works under the strong (and potentially incorrect)
+  // assumption that information related to register def/uses can be obtained
+  // from MCInstrDesc.
+  //
+  // However class MCInstrDesc is used to describe MachineInstr objects and not
+  // MCInst objects. To be more specific, MCInstrDesc objects are opcode
+  // descriptors that are automatically generated via tablegen based on the
+  // instruction set information available from the target .td files.  That
+  // means, the number of (explicit) definitions according to MCInstrDesc alwyas
+  // matches the cardinality of the `(outs)` set in tablegen.
+  //
+  // By constructions, definitions must appear first in the operand sequence of
+  // a MachineInstr. Also, the (outs) sequence is preserved (example: the first
+  // element in the outs set is the first operand in the corresponding
+  // MachineInstr).  That's the reason why MCInstrDesc only need to declare the
+  // total number of register definitions, and not where those definitions are
+  // in the machine operand sequence.
+  //
+  // Unfortunately, it is not safe to use the information from MCInstrDesc to
+  // also describe MCInst objects. An MCInst object can be obtained from a
+  // MachineInstr through a lowering step which may restructure the operand
+  // sequence (and even remove or introduce new operands). So, there is a high
+  // risk that the lowering step breaks the assumptions that register
+  // definitions are always at the beginning of the machine operand sequence.
+  //
+  // This is a fundamental problem, and it is still an open problem. Essentially
+  // we have to find a way to correlate def/use operands of a MachineInstr to
+  // operands of an MCInst. Otherwise, we cannot correctly reconstruct data
+  // dependencies, nor we can correctly interpret the scheduling model, which
+  // heavily uses machine operand indices to define processor read-advance
+  // information, and to identify processor write resources.  Essentially, we
+  // need something like a MCInstrDesc, but for MCInst.  But we also need a way
+  // to map MCInst operands back to MachineInstr operands, in order to be able
+  // to correctly identify register definitions, register uses, and read-advance
+  // information.
+
+  // This requires that to compute operand indices related to MCInst register
+  // definitions. This information could be automatically generated via tablegen
+  // at least for all the opcodes that don't require a custom lowering step from
+  // MachineInstr to MCInst.  As long as we can correlate register definitions
+  // from a MachineInstr to register definitions in a MCInst, then we are done.
+  //
+  // Unfortunately, we don't have that information now. So, this prototype
+  // currently work under the strong (and potentially flawed) assumption that we
+  // can always safely trust the content of an MCInstrDesc.  For example, we can
+  // query a MCInstrDesc to obtain the number of explicit and implicit register
+  // defintions.  We also assume that register definitions always come first in
+  // the operand sequence. This last assumption normally makes sense for
+  // MachineInstr, where register definitions always appear at the beginning of
+  // the operands sequence. In reality, these assumptions could be broken by the
+  // lowering step, which can decide to lay out operands in a different order
+  // than the original order of operand as specified by the MachineInstr.
+  //
+  // Things get even more complicated in the presence of "optional" register
+  // definitions. For MachineInstr, optional register definitions are always at
+  // the end of the operand sequence. Some ARM instructions that may update the
+  // status flags specify that register as a optional operand.  Since we don't
+  // have operand descriptors for MCInst, we assume for now that the optional
+  // definition is always the last operand of a MCInst.  Again, this assumption
+  // may be okay for most targets. However, there is no guarantee that targets
+  // would respect that.
+  //
+  // In conclusion: these are for now the strong assumptions made by the tool:
+  //  * The number of explicit and implicit register definitions in a MCInst
+  //    matches the number of explicit and implicit definitions according to
+  //    the opcode descriptor (MCInstrDesc).
+  //  * Register definitions take precedence over register uses in the operands
+  //    list.
+  //  * If an opcode specifies an optional definition, then the optional
+  //    definition is always the last operand in the sequence, and it can be
+  //    set to zero (i.e. "no register").
+  //
+  // These assumptions luckily work quite well for some out-of-order
+  // in-tree targets like x86. This is mainly because the vast majority of
+  // the instructions is expanded to MCInst in a very straightforward way.
+  // So, the lowering of a MachineOperand is often implemented as a loop of
+  // "lower operand", which preservers the original order of the operands.
+  //
+  // In the longer term, we obviously need a proper solution for this issue.
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
+  if (MCDesc.hasOptionalDef())
+    TotalDefs++;
+  ID.Writes.resize(TotalDefs);
+  // Iterate over the operands list, and skip non-register operands.
+  // The first NumExplictDefs register operands are expected to be register
+  // definitions.
+  unsigned CurrentDef = 0;
+  unsigned i = 0;
+  for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = i;
+    if (CurrentDef < NumWriteLatencyEntries) {
+      const MCWriteLatencyEntry &WLE =
+          *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
+      // Conservatively default to MaxLatency.
+      Write.Latency = WLE.Cycles == -1 ? ID.MaxLatency : WLE.Cycles;
+      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+    } else {
+      // Assign a default latency for this write.
+      Write.Latency = ID.MaxLatency;
+      Write.SClassOrWriteResourceID = 0;
+    }
+    Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
+    Write.IsOptionalDef = false;
+    DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex
+                 << ", Latency=" << Write.Latency << ", WriteResourceID="
+                 << Write.SClassOrWriteResourceID << '\n');
+    CurrentDef++;
+  }
+
+  if (CurrentDef != NumExplicitDefs)
+    llvm::report_fatal_error(
+        "error: Expected more register operand definitions. ");
+
+  CurrentDef = 0;
+  for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
+    unsigned Index = NumExplicitDefs + CurrentDef;
+    WriteDescriptor &Write = ID.Writes[Index];
+    Write.OpIndex = -1;
+    Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = false;
+    assert(Write.RegisterID != 0 && "Expected a valid phys register!");
+    DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", PhysReg="
+                 << Write.RegisterID << ", Latency=" << Write.Latency
+                 << ", WriteResourceID=" << Write.SClassOrWriteResourceID
+                 << '\n');
+  }
+
+  if (MCDesc.hasOptionalDef()) {
+    // Always assume that the optional definition is the last operand of the
+    // MCInst sequence.
+    const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
+    if (i == MCI.getNumOperands() || !Op.isReg())
+      llvm::report_fatal_error(
+          "error: expected a register operand for an optional "
+          "definition. Instruction has not be correctly analyzed.\n",
+          false);
+
+    WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
+    Write.OpIndex = MCI.getNumOperands() - 1;
+    // Assign a default latency for this write.
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = true;
+  }
+}
+
+static void populateReads(InstrDesc &ID, const MCInst &MCI,
+                          const MCInstrDesc &MCDesc,
+                          const MCSchedClassDesc &SCDesc,
+                          const MCSubtargetInfo &STI) {
+  unsigned SchedClassID = MCDesc.getSchedClass();
+  bool HasReadAdvanceEntries = SCDesc.NumReadAdvanceEntries > 0;
+
+  unsigned i = 0;
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  // Skip explicit definitions.
+  for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (Op.isReg())
+      NumExplicitDefs--;
+  }
+
+  if (NumExplicitDefs)
+    llvm::report_fatal_error(
+        "error: Expected more register operand definitions. ", false);
+
+  unsigned NumExplicitUses = MCI.getNumOperands() - i;
+  unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
+  if (MCDesc.hasOptionalDef()) {
+    assert(NumExplicitUses);
+    NumExplicitUses--;
+  }
+  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
+  if (!TotalUses)
+    return;
+
+  ID.Reads.resize(TotalUses);
+  for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = i + CurrentUse;
+    Read.HasReadAdvanceEntries = HasReadAdvanceEntries;
+    Read.SchedClassID = SchedClassID;
+    DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex);
+  }
+
+  for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
+    Read.OpIndex = -1;
+    Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
+    Read.HasReadAdvanceEntries = false;
+    Read.SchedClassID = SchedClassID;
+    DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex
+                 << ", RegisterID=" << Read.RegisterID << '\n');
+  }
+}
+
+void InstrBuilder::CreateInstrDescImpl(const MCSubtargetInfo &STI,
+                                       const MCInst &MCI) {
+  assert(STI.getSchedModel().hasInstrSchedModel() &&
+         "Itineraries are not yet supported!");
+
+  unsigned short Opcode = MCI.getOpcode();
+  // Obtain the instruction descriptor from the opcode.
+  const MCInstrDesc &MCDesc = MCII.get(Opcode);
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Then obtain the scheduling class information from the instruction.
+  const MCSchedClassDesc &SCDesc =
+      *SM.getSchedClassDesc(MCDesc.getSchedClass());
+
+  // Create a new empty descriptor.
+  InstrDesc *ID = new InstrDesc();
+
+  if (SCDesc.isVariant()) {
+    errs() << "warning: don't know how to model variant opcodes.\n"
+           << "note: assume 1 micro opcode.\n";
+    ID->NumMicroOps = 1U;
+  } else {
+    ID->NumMicroOps = SCDesc.NumMicroOps;
+  }
+
+  if (MCDesc.isCall()) {
+    // We don't correctly model calls.
+    errs() << "warning: found a call in the input assembly sequence.\n"
+           << "note: call instructions are not correctly modeled. Assume a "
+              "latency of 100cy.\n";
+  }
+
+  if (MCDesc.isReturn()) {
+    errs() << "warning: found a return instruction in the input assembly "
+              "sequence.\n"
+           << "note: program counter updates are ignored.\n";
+  }
+
+  ID->MayLoad = MCDesc.mayLoad();
+  ID->MayStore = MCDesc.mayStore();
+  ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+
+  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
+  populateWrites(*ID, MCI, MCDesc, SCDesc, STI);
+  populateReads(*ID, MCI, MCDesc, SCDesc, STI);
+
+  DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
+  DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
+
+  // Now add the new descriptor.
+  Descriptors[Opcode] = std::unique_ptr<const InstrDesc>(ID);
+}
+
+const InstrDesc &InstrBuilder::GetOrCreateInstrDesc(const MCSubtargetInfo &STI,
+                                                    const MCInst &MCI) {
+  auto it = Descriptors.find(MCI.getOpcode());
+  if (it == Descriptors.end())
+    CreateInstrDescImpl(STI, MCI);
+  return *Descriptors[MCI.getOpcode()].get();
+}
+
+Instruction *InstrBuilder::CreateInstruction(const MCSubtargetInfo &STI,
+                                             DispatchUnit &DU, unsigned Idx,
+                                             const MCInst &MCI) {
+  const InstrDesc &D = GetOrCreateInstrDesc(STI, MCI);
+  Instruction *NewIS = new Instruction(D);
+
+  // Populate Reads first.
+  const MCSchedModel &SM = STI.getSchedModel();
+  SmallVector<WriteState *, 4> DependentWrites;
+  for (const ReadDescriptor &RD : D.Reads) {
+    int RegID = -1;
+    if (RD.OpIndex != -1) {
+      // explicit read.
+      const MCOperand &Op = MCI.getOperand(RD.OpIndex);
+      // Skip non-register operands.
+      if (!Op.isReg())
+        continue;
+      RegID = Op.getReg();
+    } else {
+      // Implicit read.
+      RegID = RD.RegisterID;
+    }
+
+    // Skip invalid register operands.
+    if (!RegID)
+      continue;
+
+    // Okay, this is a register operand. Create a ReadState for it.
+    assert(RegID > 0 && "Invalid register ID found!");
+    ReadState *NewRDS = new ReadState(RD);
+    NewIS->GetUses().emplace_back(std::unique_ptr<ReadState>(NewRDS));
+    DU.CollectWrites(DependentWrites, RegID);
+    NewRDS->SetDependentWrites(DependentWrites.size());
+    DEBUG(dbgs() << "Found " << DependentWrites.size()
+                 << " dependent writes\n");
+
+    // We know that this read depends on all the writes in DependentWrites.
+    // For each write, check if we have ReadAdvance information, and use it
+    // to figure out after how many cycles this read becomes available.
+    if (!RD.HasReadAdvanceEntries) {
+      for (WriteState *WS : DependentWrites)
+        WS->AddUser(NewRDS, /* ReadAdvance */ 0);
+      // Prepare the set for another round.
+      DependentWrites.clear();
+      continue;
+    }
+
+    const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+    for (WriteState *WS : DependentWrites) {
+      unsigned WriteResID = WS->GetWriteResourceID();
+      int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.OpIndex, WriteResID);
+      WS->AddUser(NewRDS, ReadAdvance);
+    }
+
+    // Prepare the set for another round.
+    DependentWrites.clear();
+  }
+
+  // Now populate writes.
+  for (const WriteDescriptor &WD : D.Writes) {
+    unsigned RegID =
+        WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
+    assert((RegID || WD.IsOptionalDef) && "Expected a valid register ID!");
+    // Special case where this is a optional definition, and the actual register
+    // is 0.
+    if (WD.IsOptionalDef && !RegID)
+      continue;
+
+    WriteState *NewWS = new WriteState(WD);
+    NewIS->GetDefs().emplace_back(std::unique_ptr<WriteState>(NewWS));
+    NewWS->SetRegisterID(RegID);
+    DU.AddNewRegisterMapping(NewWS);
+  }
+
+  // Update Latency.
+  NewIS->SetCyclesLeft(D.MaxLatency);
+  return NewIS;
+}
+
+} // namespace mca
Index: tools/llvm-mca/Instruction.h
===================================================================
--- tools/llvm-mca/Instruction.h
+++ tools/llvm-mca/Instruction.h
@@ -0,0 +1,336 @@
+//===--------------------- Instruction.h ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines abstractions used by the Backend to model register reads,
+/// register writes and instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+
+#include "llvm/Support/MathExtras.h"
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace mca {
+
+using namespace llvm;
+
+struct WriteDescriptor;
+struct ReadDescriptor;
+class WriteState;
+class ReadState;
+
+constexpr int UNKNOWN_CYCLES = -512;
+
+/// \brief A register write descriptor.
+struct WriteDescriptor {
+  int OpIndex; // Operand index. -1 if this is an implicit write.
+  // Write latency. Number of cycles before write-back stage.
+  int Latency;
+  // This field is set to a value different than zero only if this
+  // is an implicit definition.
+  unsigned RegisterID;
+  // True if this write generates a partial update of a super-registers.
+  // On X86, this flag is set by byte/word writes on GPR registers. Also,
+  // a write of an XMM register only partially updates the corresponding
+  // YMM super-register if the write is associated to a legacy SSE instruction.
+  bool FullyUpdatesSuperRegs;
+  // Instruction itineraries would set this field to the SchedClass ID.
+  // Otherwise, it defaults to the WriteResourceID from teh MCWriteLatencyEntry
+  // element associated to this write.
+  // When computing read latencies, this value is matched against the
+  // "ReadAdvance" information. The hardware backend may implement
+  // dedicated forwarding paths to quickly propagate write results to dependent
+  // instructions waiting in the reservation station (effectively bypassing the
+  // write-back stage).
+  unsigned SClassOrWriteResourceID;
+  // True only if this is a write obtained from an optional definition.
+  // Optional definitions are allowed to reference regID zero (i.e. "no
+  // register").
+  bool IsOptionalDef;
+};
+
+/// \brief A register read descriptor.
+struct ReadDescriptor {
+  // This field defaults to -1 if this is an implicit read.
+  int OpIndex;
+  // This field is only set if this is an implicit read.
+  unsigned RegisterID;
+  // Scheduling Class Index. It is used to query the scheduling model for the
+  // MCSchedClassDesc object.
+  unsigned SchedClassID;
+  // True if there may be a local forwarding logic in hardware to serve a
+  // write used by this read. This information, along with SchedClassID, is
+  // used to dynamically check at Instruction creation time, if the input
+  // operands can benefit from a ReadAdvance bonus.
+  bool HasReadAdvanceEntries;
+};
+
+/// \brief Tracks uses of a register definition (e.g. register write).
+///
+/// Each implicit/explicit register write is associated with an instance of
+/// this class. A WriteState object tracks the dependent users of a
+/// register write. It also tracks how many cycles are left before the write
+/// back stage.
+class WriteState {
+  const WriteDescriptor &WD;
+  // On instruction issue, this field is set equal to the write latency.
+  // Before instruction issue, this field defaults to -512, a special
+  // value that represents an "unknown" number of cycles.
+  int CyclesLeft;
+
+  // Actual register defined by this write. This field is only used
+  // to speedup queries on the register file.
+  // For implicit writes, this field always matches the value of
+  // field RegisterID from WD.
+  unsigned RegisterID;
+
+  // A list of dependent reads. Users is a set of dependent
+  // reads. A dependent read is added to the set only if CyclesLeft
+  // is "unknown". As soon as CyclesLeft is 'known', each user in the set
+  // gets notified with the actual CyclesLeft.
+
+  // The 'second' element of a pair is a "ReadAdvance" number of cycles.
+  std::set<std::pair<ReadState *, int>> Users;
+
+public:
+  WriteState(const WriteDescriptor &Desc)
+      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(Desc.RegisterID) {}
+  WriteState(const WriteState &Other) = delete;
+  WriteState &operator=(const WriteState &Other) = delete;
+
+  int GetCyclesLeft() const { return CyclesLeft; }
+  unsigned GetWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+  unsigned GetRegisterID() const { return RegisterID; }
+  void SetRegisterID(unsigned ID) { RegisterID = ID; }
+
+  void AddUser(ReadState *Use, int ReadAdvance);
+  bool FullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
+  bool IsWrittenBack() const { return CyclesLeft == 0; }
+
+  // On every cycle, update CyclesLeft and notify dependent users.
+  void CycleEvent();
+  void OnInstructionIssued();
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief Tracks register operand latency in cycles.
+///
+/// A read may be dependent on more than one write. This occurs when some
+/// writes only partially update the register associated to this read.
+class ReadState {
+  const ReadDescriptor &RD;
+  unsigned DependentWrites;
+  int CyclesLeft;
+  unsigned TotalCycles;
+
+public:
+  bool IsReady() const {
+    if (DependentWrites)
+      return false;
+    return (CyclesLeft == UNKNOWN_CYCLES || CyclesLeft == 0);
+  }
+
+  ReadState(const ReadDescriptor &Desc)
+      : RD(Desc), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES),
+        TotalCycles(0) {}
+  ReadState(const ReadState &Other) = delete;
+  ReadState &operator=(const ReadState &Other) = delete;
+
+  const ReadDescriptor &GetDescriptor() const { return RD; }
+  unsigned GetSchedClass() const { return RD.SchedClassID; }
+  void CycleEvent();
+  void WriteStartEvent(unsigned Cycles);
+  void SetDependentWrites(unsigned Writes) { DependentWrites = Writes; }
+};
+
+/// \brief A sequence of cycles.
+///
+/// This class can be used as a building block to construct ranges of cycles.
+class CycleSegment {
+  unsigned Begin; // Inclusive.
+  unsigned End;   // Exclusive.
+  bool Reserved;  // Resources associated to this segment must be reserved.
+
+public:
+  CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false)
+      : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {}
+
+  bool Contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; }
+  bool StartsAfter(const CycleSegment &CS) const { return End <= CS.Begin; }
+  bool EndsBefore(const CycleSegment &CS) const { return Begin >= CS.End; }
+  bool Overlaps(const CycleSegment &CS) const {
+    return !StartsAfter(CS) && !EndsBefore(CS);
+  }
+  bool IsExecuting() const { return Begin == 0 && End != 0; }
+  bool IsExecuted() const { return End == 0; }
+  bool operator<(const CycleSegment &Other) const {
+    return Begin < Other.Begin;
+  }
+  CycleSegment &operator--(void) {
+    if (Begin)
+      Begin--;
+    if (End)
+      End--;
+    return *this;
+  }
+
+  bool IsValid() const { return Begin <= End; }
+  unsigned size() const { return End - Begin; };
+  void Subtract(unsigned Cycles) {
+    assert(End >= Cycles);
+    End -= Cycles;
+  }
+
+  unsigned begin() const { return Begin; }
+  unsigned end() const { return End; }
+  void setEnd(unsigned NewEnd) { End = NewEnd; }
+  bool IsReserved() const { return Reserved; }
+  void SetReserved() { Reserved = true; }
+};
+
+/// \brief Helper used by class InstrDesc to describe how hardware resources
+/// are used.
+///
+/// This class describes how many resource units of a specific resource kind
+/// (and how many cycles) are "used" by an instruction.
+struct ResourceUsage {
+  CycleSegment CS;
+  unsigned NumUnits;
+  ResourceUsage(CycleSegment Cycles, unsigned Units = 1)
+      : CS(Cycles), NumUnits(Units) {}
+  unsigned size() const { return CS.size(); }
+  bool IsReserved() const { return CS.IsReserved(); }
+  void SetReserved() { CS.SetReserved(); }
+};
+
+/// \brief An instruction descriptor
+struct InstrDesc {
+  std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
+  std::vector<ReadDescriptor> Reads;   // Implicit reads are at the end.
+
+  // For every resource used by an instruction of this kind, this vector
+  // reports the number of "consumed cycles".
+  std::vector<std::pair<uint64_t, ResourceUsage>> Resources;
+
+  // A list of buffered resources consumed by this instruction.
+  std::vector<uint64_t> Buffers;
+  unsigned MaxLatency;
+  // Number of MicroOps for this instruction.
+  unsigned NumMicroOps;
+
+  bool MayLoad;
+  bool MayStore;
+  bool HasSideEffects;
+};
+
+/// An instruction dispatched to the out-of-order backend.
+///
+/// This class is used to monitor changes in the internal state of instructions
+/// that are dispatched by the DispatchUnit to the hardware schedulers.
+class Instruction {
+  const InstrDesc &Desc;
+
+  enum InstrStage {
+    IS_INVALID,   // Instruction in an invalid state.
+    IS_AVAILABLE, // Instruction dispatched but operands are not ready.
+    IS_READY,     // Instruction dispatched and operands ready.
+    IS_EXECUTING, // Instruction issued.
+    IS_EXECUTED,  // Instruction executed. Values are written back.
+    IS_RETIRED    // Instruction retired.
+  };
+
+  // The current instruction stage.
+  enum InstrStage Stage;
+
+  // This value defaults to the instruction latency. This instruction is
+  // considered executed when field CyclesLeft goes to zero.
+  int CyclesLeft;
+
+  // Retire Unit token ID for this instruction.
+  unsigned RCUTokenID;
+
+  using VecDefs = std::vector<std::unique_ptr<WriteState>>;
+  using VecUses = std::vector<std::unique_ptr<ReadState>>;
+
+  // Output dependencies.
+  // One entry per each implicit and explicit register definition.
+  VecDefs Defs;
+
+  // Input dependencies.
+  // One entry per each implicit and explicit register use.
+  VecUses Uses;
+
+  // This instruction has already been dispatched, and all operands are ready.
+  void SetReady() {
+    assert(Stage == IS_AVAILABLE);
+    Stage = IS_READY;
+  }
+
+public:
+  Instruction(const InstrDesc &D)
+      : Desc(D), Stage(IS_INVALID), CyclesLeft(-1) {}
+  Instruction(const Instruction &Other) = delete;
+  Instruction &operator=(const Instruction &Other) = delete;
+
+  VecDefs &GetDefs() { return Defs; }
+  const VecDefs &GetDefs() const { return Defs; }
+  VecUses &GetUses() { return Uses; }
+  const VecUses &GetUses() const { return Uses; }
+  const InstrDesc &GetDesc() const { return Desc; }
+
+  unsigned GetRCUTokenID() const { return RCUTokenID; }
+  int GetCyclesLeft() const { return CyclesLeft; }
+  void SetCyclesLeft(int Cycles) { CyclesLeft = Cycles; }
+  void SetRCUTokenID(unsigned TokenID) { RCUTokenID = TokenID; }
+
+  // Transition to the dispatch stage.
+  // No definition is updated because the instruction is not "executing".
+  void Dispatch() {
+    assert(Stage == IS_INVALID);
+    Stage = IS_AVAILABLE;
+  }
+
+  // Instruction issued. Transition to the IS_EXECUTING state, and update
+  // all the definitions.
+  void Execute();
+
+  void ForceExecuted() {
+    assert((Stage == IS_INVALID && IsZeroLatency()) ||
+           (Stage == IS_READY && Desc.MaxLatency == 0));
+    Stage = IS_EXECUTED;
+  }
+
+  // Checks if operands are available. If all operands area ready,
+  // then this forces a transition from IS_AVAILABLE to IS_READY.
+  bool IsReady();
+
+  bool IsDispatched() const { return Stage == IS_AVAILABLE; }
+  bool IsExecuting() const { return Stage == IS_EXECUTING; }
+  bool IsExecuted() const { return Stage == IS_EXECUTED; }
+  bool IsZeroLatency() const;
+
+  void Retire() {
+    assert(Stage == IS_EXECUTED);
+    Stage = IS_RETIRED;
+  }
+
+  void CycleEvent();
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/Instruction.cpp
===================================================================
--- tools/llvm-mca/Instruction.cpp
+++ tools/llvm-mca/Instruction.cpp
@@ -0,0 +1,132 @@
+//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines abstractions used by the Backend to model register reads,
+// register writes and instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+void ReadState::WriteStartEvent(unsigned Cycles) {
+  assert(DependentWrites);
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+
+  // This read may be dependent on more than one write. This typically occurs
+  // when a definition is the result of multiple writes where at least one
+  // write does a partial register update.
+  // The HW is forced to do some extra bookkeeping to track of all the
+  // dependent writes, and implement a merging scheme for the partial writes.
+  --DependentWrites;
+  TotalCycles = std::max(TotalCycles, Cycles);
+
+  if (!DependentWrites)
+    CyclesLeft = TotalCycles;
+}
+
+void WriteState::OnInstructionIssued() {
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+  // Update the number of cycles left based on the WriteDescriptor info.
+  CyclesLeft = WD.Latency;
+
+  // Now that the time left before write-back is know, notify
+  // all the users.
+  for (auto &User : Users) {
+    ReadState *RS = User.first;
+    unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
+    RS->WriteStartEvent(ReadCycles);
+  }
+}
+
+void WriteState::AddUser(ReadState *User, int ReadAdvance) {
+  // If CyclesLeft is different than -1, then we don't need to
+  // update the list of users. We can just notify the user with
+  // the actual number of cycles left (which may be zero).
+  if (CyclesLeft != UNKNOWN_CYCLES) {
+    unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance);
+    User->WriteStartEvent(ReadCycles);
+    return;
+  }
+
+  std::pair<ReadState *, int> NewPair(User, ReadAdvance);
+  Users.insert(NewPair);
+}
+
+void WriteState::CycleEvent() {
+  // Note: CyclesLeft can be a negative number. It is an error to
+  // make it an unsigned quantity because users of this write may
+  // specify a negative ReadAdvance.
+  if (CyclesLeft != UNKNOWN_CYCLES)
+    CyclesLeft--;
+}
+
+void ReadState::CycleEvent() {
+  // If CyclesLeft is unknown, then bail out immediately.
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    return;
+
+  // If there are still dependent writes, or we reached cycle zero,
+  // then just exit.
+  if (DependentWrites || CyclesLeft == 0)
+    return;
+
+  CyclesLeft--;
+}
+
+#ifndef NDEBUG
+void WriteState::dump() const {
+  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << WD.Latency << ", RegID "
+         << GetRegisterID() << ", Cycles Left=" << GetCyclesLeft() << " }\n";
+}
+#endif
+
+bool Instruction::IsReady() {
+  if (Stage == IS_READY)
+    return true;
+
+  assert(Stage == IS_AVAILABLE);
+  for (const auto &Use : Uses)
+    if (!Use.get()->IsReady())
+      return false;
+
+  SetReady();
+  return true;
+}
+
+void Instruction::Execute() {
+  assert(Stage == IS_READY);
+  Stage = IS_EXECUTING;
+  for (auto &Def : Defs)
+    Def->OnInstructionIssued();
+}
+
+bool Instruction::IsZeroLatency() const {
+  return Desc.MaxLatency == 0 && Defs.size() == 0 && Uses.size() == 0;
+}
+
+void Instruction::CycleEvent() {
+  if (IsDispatched()) {
+    for (auto &Use : Uses)
+      Use->CycleEvent();
+    return;
+  }
+  if (IsExecuting()) {
+    for (auto &Def : Defs)
+      Def->CycleEvent();
+    CyclesLeft--;
+  }
+  if (!CyclesLeft)
+    Stage = IS_EXECUTED;
+}
+
+} // Namespace end.
Index: tools/llvm-mca/LLVMBuild.txt
===================================================================
--- tools/llvm-mca/LLVMBuild.txt
+++ tools/llvm-mca/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-mc/LLVMBuild.txt ----------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-mca
+parent = Tools
+required_libraries = MC MCParser Support all-targets
Index: tools/llvm-mca/LSUnit.h
===================================================================
--- tools/llvm-mca/LSUnit.h
+++ tools/llvm-mca/LSUnit.h
@@ -0,0 +1,162 @@
+//===------------------------- LSUnit.h --------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load/Store unit class that models load/store queues and that implements
+/// a simple weak memory consistency model.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+/// \brief A Load/Store Unit implementing a load and store queues.
+///
+/// This class implements a load queue and a store queue to emulate the
+/// out-of-order execution of memory operations.
+/// Each load (or store) consumes an entry in the load (or store) queue.
+///
+/// Rules are:
+/// 1) A younger load is allowed to pass an older load only if there are no
+///    stores nor barriers in between the two loads.
+/// 2) An younger store is not allowed to pass an older store.
+/// 3) A younger store is not allowed to pass an older load.
+/// 4) A younger load is allowed to pass an older store only if the load does
+///    not alias with the store.
+///
+/// This class conservatively/pessimistically assumes that loads
+/// may-alias store operations. Under this assumption, younger loads are never
+/// allowed to pass older stores (this negatively affects rule 4.).
+///
+/// In order to make it possible for a younger load to pass an older store,
+/// flag `AssumeNoAlias` must be set by the constructor of LSUnit.
+/// Essentially, this LSUnit doesn't attempt to run any sort alias analysis to
+/// predict when loads and stores don't alias with eachother.
+///
+/// In the case of write-combining memory, rule 2. could be relaxed to allow
+/// reordering of non-aliasing store operations. In practice, this class
+/// doesn't allow it.
+/// To put it in another way, there is no option to specify a different memory
+/// type for memory operations (example: write-through, write-combining, etc.).
+/// Also, there is no way to weaken the memory model, and this unit doesn't
+/// support write-combining behavior.
+///
+/// No assumptions are made on the size of the store buffer.
+/// As mentioned before, this class conservatively assumes a may-alias relation
+/// between loads and stores. Consequently, LSUnit doesn't know how to identify
+/// cases where store-to-load forwarding would occur.
+///
+/// LSUnit doesn't attempt to predict whether a load or store hits or misses
+/// the L1 cache. To be more specific, LSUnit doesn't know anything about
+/// the cache hierarchy and memory types.
+/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
+/// scheduling model provides an "optimistic" load-to-use latency (which usually
+/// matches the load-to-use latency for when there is a hit in the L1D).
+///
+/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
+/// memory-barrier like instructions.
+/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
+/// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
+/// serializes loads without forcing a flush of the load queue.
+/// Similarly, instructions that both `mayStore` and have `unmodeled side
+/// effects` are treated like store barriers. A full memory
+/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
+/// effects. This is obviously inaccurate, but this is the best that we can do
+/// at the moment.
+///
+/// Each load/store barrier consumes one entry in the load/store queue. A
+/// load/store barrier enforces ordering of loads/stores:
+///  - A younger load cannot pass a load barrier.
+///  - A younger store cannot pass a store barrier.
+///
+/// A younger load has to wait for the memory load barrier to execute.
+/// A load/store barrier is "executed" when it becomes the oldest entry in
+/// the load/store queue(s). That also means, all the older loads/stores have
+/// already been executed.
+class LSUnit {
+  // Load queue size.
+  // LQ_Size == 0 means that there are infinite slots in the load queue.
+  unsigned LQ_Size;
+
+  // Store queue size.
+  // SQ_Size == 0 means that there are infinite slots in the store queue.
+  unsigned SQ_Size;
+
+  // If true, loads will never alias with stores.
+  bool NoAlias;
+
+  std::set<unsigned> LoadQueue;
+  std::set<unsigned> StoreQueue;
+
+  void AssignLQSlot(unsigned Index);
+  void AssignSQSlot(unsigned Index);
+  bool IsReadyNoAlias(unsigned Index) const;
+
+  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
+  // conservatively treated as a store barrier. It forces older store to be
+  // executed before newer stores are issued.
+  std::set<unsigned> StoreBarriers;
+
+  // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
+  // conservatively treated as a load barrier. It forces older loads to execute
+  // before newer loads are issued.
+  std::set<unsigned> LoadBarriers;
+
+public:
+  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
+      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+
+  bool IsSQEmpty() const { return StoreQueue.empty(); }
+  bool IsLQEmpty() const { return LoadQueue.empty(); }
+  bool IsSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
+  bool IsLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
+
+  void Reserve(unsigned Index, bool MayLoad, bool MayStore, bool IsMemBarrier) {
+    if (!MayLoad && !MayStore)
+      return;
+    if (MayLoad) {
+      if (IsMemBarrier)
+        LoadBarriers.insert(Index);
+      AssignLQSlot(Index);
+    }
+    if (MayStore) {
+      if (IsMemBarrier)
+        StoreBarriers.insert(Index);
+      AssignSQSlot(Index);
+    }
+  }
+
+  // The rules are:
+  // 1. A store may not pass a previous store.
+  // 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+  // 3. A load may pass a previous load.
+  // 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+  // 5. A load has to wait until an older load barrier is fully executed.
+  // 6. A store has to wait until an older store barrier is fully executed.
+  bool IsReady(unsigned Index) const;
+  void OnInstructionExecuted(unsigned Index);
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/LSUnit.cpp
===================================================================
--- tools/llvm-mca/LSUnit.cpp
+++ tools/llvm-mca/LSUnit.cpp
@@ -0,0 +1,115 @@
+//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load-Store Unit for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#include "LSUnit.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+#ifndef NDEBUG
+void LSUnit::dump() const {
+  dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
+  dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
+  dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
+  dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
+}
+#endif
+
+void LSUnit::AssignLQSlot(unsigned Index) {
+  assert(!IsLQFull());
+  assert(LoadQueue.count(Index) == 0);
+
+  DEBUG(dbgs() << "[LSUnit] - AssignLQSlot <Idx=" << Index
+               << ",slot=" << LoadQueue.size() << ">\n");
+  LoadQueue.insert(Index);
+}
+
+void LSUnit::AssignSQSlot(unsigned Index) {
+  assert(!IsSQFull());
+  assert(StoreQueue.count(Index) == 0);
+
+  DEBUG(dbgs() << "[LSUnit] - AssignSQSlot <Idx=" << Index
+               << ",slot=" << StoreQueue.size() << ">\n");
+  StoreQueue.insert(Index);
+}
+
+bool LSUnit::IsReady(unsigned Index) const {
+  bool IsALoad = LoadQueue.count(Index) != 0;
+  bool IsAStore = StoreQueue.count(Index) != 0;
+  unsigned LoadBarrierIndex = LoadBarriers.empty() ? 0 : *LoadBarriers.begin();
+  unsigned StoreBarrierIndex = StoreBarriers.empty() ? 0 : *StoreBarriers.begin();
+
+  if (IsALoad && LoadBarrierIndex) {
+    if (Index > LoadBarrierIndex)
+      return false;
+    if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
+      return false;
+  }
+
+  if (IsAStore && StoreBarrierIndex) {
+    if (Index > StoreBarrierIndex)
+      return false;
+    if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
+      return false;
+  }
+
+  if (NoAlias && IsALoad)
+    return true;
+
+  if (StoreQueue.size()) {
+    // Check if this memory operation is younger than the older store.
+    if (Index > *StoreQueue.begin())
+      return false;
+  }
+
+  // Okay, we are older than the oldest store in the queue.
+  // If there are no pending loads, then we can say for sure that this
+  // instruction is ready.
+  if (IsLQEmpty())
+    return true;
+
+  // Check if there are no older loads.
+  if (Index <= *LoadQueue.begin())
+    return true;
+
+  // There is at least one younger load.
+  return !IsAStore;
+}
+
+void LSUnit::OnInstructionExecuted(unsigned Index) {
+  std::set<unsigned>::iterator it = LoadQueue.find(Index);
+  if (it != LoadQueue.end()) {
+    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the load queue.\n");
+    LoadQueue.erase(it);
+  }
+
+  it = StoreQueue.find(Index);
+  if (it != StoreQueue.end()) {
+    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the store queue.\n");
+    StoreQueue.erase(it);
+  }
+
+  if (!StoreBarriers.empty() && Index == *StoreBarriers.begin())
+    StoreBarriers.erase(StoreBarriers.begin());
+  if (!LoadBarriers.empty() && Index == *LoadBarriers.begin())
+    LoadBarriers.erase(LoadBarriers.begin());
+}
+} // namespace mca
Index: tools/llvm-mca/ResourcePressureView.h
===================================================================
--- tools/llvm-mca/ResourcePressureView.h
+++ tools/llvm-mca/ResourcePressureView.h
@@ -0,0 +1,115 @@
+//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file define class ResourcePressureView.
+/// Class ResourcePressureView observes hardware events generated by
+/// the Backend object and collects statistics related to resource usage at
+/// instruction granularity.
+/// Resource pressure information is then printed out to a stream in the
+/// form of a table like the one from the example below:
+///
+/// Resources:
+/// [0] - JALU0
+/// [1] - JALU1
+/// [2] - JDiv
+/// [3] - JFPM
+/// [4] - JFPU0
+/// [5] - JFPU1
+/// [6] - JLAGU
+/// [7] - JSAGU
+/// [8] - JSTC
+/// [9] - JVIMUL
+///
+/// Resource pressure per iteration:
+/// [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+/// 0.00   0.00   0.00   0.00   2.00   2.00   0.00   0.00   0.00   0.00
+///
+/// Resource pressure by instruction:
+/// [0]  [1]  [2]  [3]  [4]  [5]  [6]  [7]  [8]  [9]  Instructions:
+///  -    -    -    -    -   1.00  -    -    -    -   vpermilpd  $1,    %xmm0,
+///  %xmm1
+///  -    -    -    -   1.00  -    -    -    -    -   vaddps     %xmm0, %xmm1,
+///  %xmm2
+///  -    -    -    -    -   1.00  -    -    -    -   vmovshdup  %xmm2, %xmm3
+///  -    -    -    -   1.00  -    -    -    -    -   vaddss     %xmm2, %xmm3,
+///  %xmm4
+///
+/// In this example, we have AVX code executed on AMD Jaguar (btver2).
+/// Both shuffles and vector floating point add operations on XMM registers have
+/// a reciprocal throughput of 1cy.
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to
+/// pipeline JFPU1. The overall pressure per iteration is reported by two
+/// tables: the first smaller table is the resource pressure per iteration;
+/// the second table reports resource pressure per instruction. Values are the
+/// average resource cycles consumed by an instruction.
+/// Every vector add from the example uses resource JFPU0 for an average of 1cy
+/// per iteration. Consequently, the resource pressure on JFPU0 is of 2cy per
+/// iteration.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+using namespace llvm;
+
+class Backend;
+
+/// This class collects resource pressure statistics and it is able to print
+/// out all the collected information as a table to an output stream.
+class ResourcePressureView : public HWEventListener {
+  const MCSubtargetInfo &STI;
+  MCInstPrinter &MCIP;
+  const SourceMgr &Source;
+
+  // Map to quickly get a resource descriptor from a mask.
+  std::map<uint64_t, unsigned> Resource2VecIndex;
+
+  // Table of resources used by instructions.
+  std::vector<unsigned> ResourceUsage;
+  unsigned NumResourceUnits;
+
+  const MCInst &GetMCInstFromIndex(unsigned Index) const;
+  void printResourcePressurePerIteration(raw_ostream &OS,
+                                         unsigned Executions) const;
+  void printResourcePressurePerInstruction(raw_ostream &OS,
+                                           unsigned Executions) const;
+  void initialize(const ArrayRef<uint64_t> ProcResoureMasks);
+
+public:
+  ResourcePressureView(const MCSubtargetInfo &ST, MCInstPrinter &Printer,
+                       const SourceMgr &SM,
+                       const ArrayRef<uint64_t> ProcResourceMasks)
+      : STI(ST), MCIP(Printer), Source(SM) {
+    initialize(ProcResourceMasks);
+  }
+
+  void OnInstructionIssued(
+      unsigned Index,
+      const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+
+  void printResourcePressure(raw_ostream &OS, unsigned Cycles) const {
+    unsigned Executions = Source.GetNumIterations();
+    printResourcePressurePerIteration(OS, Executions);
+    printResourcePressurePerInstruction(OS, Executions);
+  }
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/ResourcePressureView.cpp
===================================================================
--- tools/llvm-mca/ResourcePressureView.cpp
+++ tools/llvm-mca/ResourcePressureView.cpp
@@ -0,0 +1,174 @@
+//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods in the ResourcePressureView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ResourcePressureView.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+void ResourcePressureView::initialize(
+    const ArrayRef<uint64_t> ProcResourceMasks) {
+  // Populate the map of resource descriptors.
+  unsigned R2VIndex = 0;
+  const MCSchedModel &SM = STI.getSchedModel();
+  for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(i);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    uint64_t ResourceMask = ProcResourceMasks[i];
+    Resource2VecIndex.insert(
+        std::pair<uint64_t, unsigned>(ResourceMask, R2VIndex));
+    R2VIndex += ProcResource.NumUnits;
+  }
+
+  NumResourceUnits = R2VIndex;
+  ResourceUsage.resize(NumResourceUnits * (Source.size() + 1));
+  std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0);
+}
+
+void ResourcePressureView::OnInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  unsigned SourceIdx = Index % Source.size();
+  for (const auto &Use : Used) {
+    const ResourceRef &RR = Use.first;
+    assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end());
+    unsigned R2VIndex = Resource2VecIndex[RR.first];
+    R2VIndex += countTrailingZeros(RR.second);
+    ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second;
+    ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second;
+  }
+}
+
+static void printColumnNames(raw_string_ostream &OS, const MCSchedModel &SM) {
+  for (unsigned i = 1, ResourceIndex = 0; i < SM.getNumProcResourceKinds();
+       ++i) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(i);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    if (NumUnits == 1) {
+      OS << '[' << ResourceIndex << ']';
+      if (ResourceIndex < 10)
+        OS << "    ";
+      else
+        OS << "   ";
+      ResourceIndex++;
+      continue;
+    }
+
+    for (unsigned j = 0; j < NumUnits; ++j) {
+      OS << "[" << ResourceIndex << '.' << j << ']';
+      if (ResourceIndex < 10)
+        OS << "  ";
+      else
+        OS << ' ';
+    }
+    ResourceIndex++;
+  }
+}
+
+void ResourcePressureView::printResourcePressurePerIteration(
+    raw_ostream &OS, unsigned Executions) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nResources:\n";
+  const MCSchedModel &SM = STI.getSchedModel();
+  for (unsigned i = 0, ResourceIndex = 0; i < SM.getNumProcResourceKinds();
+       ++i) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(i);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    for (unsigned j = 0; j < NumUnits; ++j) {
+      TempStream << '[' << ResourceIndex;
+      if (NumUnits > 1)
+        TempStream << '.' << j;
+      TempStream << "] - " << ProcResource.Name << '\n';
+    }
+
+    ResourceIndex++;
+  }
+
+  TempStream << "\n\nResource pressure per iteration:\n";
+  printColumnNames(TempStream, SM);
+  TempStream << '\n';
+
+  for (unsigned j = 0; j < NumResourceUnits; ++j) {
+    unsigned Usage = ResourceUsage[j + Source.size() * NumResourceUnits];
+    if (!Usage) {
+      TempStream << " -     ";
+      continue;
+    }
+
+    double Pressure = (double)Usage / Executions;
+    TempStream << format("%.2f", Pressure);
+    if (Pressure < 10.0)
+      TempStream << "   ";
+    else if (Pressure < 100.0)
+      TempStream << "  ";
+    else
+      TempStream << ' ';
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+void ResourcePressureView::printResourcePressurePerInstruction(
+    raw_ostream &OS, unsigned Executions) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nResource pressure by instruction:\n";
+  printColumnNames(TempStream, STI.getSchedModel());
+  TempStream << "\tInstructions:\n";
+
+  for (unsigned i = 0; i < Source.size(); ++i) {
+    for (unsigned j = 0; j < NumResourceUnits; ++j) {
+      unsigned Usage = ResourceUsage[j + i * NumResourceUnits];
+      if (Usage == 0) {
+        TempStream << " -     ";
+      } else {
+        double Pressure = (double)Usage / Executions;
+        if (Pressure < 0.005) {
+          TempStream << " -     ";
+        } else {
+          TempStream << format("%.2f", Pressure);
+          if (Pressure < 10.0)
+            TempStream << "   ";
+          else if (Pressure < 100.0)
+            TempStream << "  ";
+          else
+            TempStream << ' ';
+        }
+      }
+    }
+
+    MCIP.printInst(&Source.GetMCInstFromIndex(i), TempStream, "", STI);
+    TempStream << '\n';
+    TempStream.flush();
+    OS << Buffer;
+    Buffer = "";
+  }
+}
+
+} // namespace mca
Index: tools/llvm-mca/Scheduler.h
===================================================================
--- tools/llvm-mca/Scheduler.h
+++ tools/llvm-mca/Scheduler.h
@@ -0,0 +1,522 @@
+//===--------------------- Scheduler.h ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A scheduler for Processor Resource Units and Processor Resource Groups.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+
+#include "Instruction.h"
+#include "LSUnit.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+using namespace llvm;
+
+class Backend;
+
+enum ResourceStateEvent {
+  RS_BUFFER_AVAILABLE,
+  RS_BUFFER_UNAVAILABLE,
+  RS_RESERVED
+};
+
+/// \brief A descriptor for processor resources.
+///
+/// Each object of class ResourceState is associated to a specific processor
+/// resource. There is an instance of this class for every processor resource
+/// defined by the scheduling model.
+/// A ResourceState dynamically tracks the availability of units of a processor
+/// resource. For example, the ResourceState of a ProcResGroup tracks the
+/// availability of resource units which are part of the group.
+///
+/// Internally, ResourceState uses a round-robin selector to identify
+/// which unit of the group shall be used next.
+class ResourceState {
+  // A resource identifier.
+  // If this ResourceState describes a group, then the number of bits
+  // set in this mask is equivalent to the number of units of the group.
+  // A ResourceMask can be used to quickly check if a unit is part of a group
+  // or not.
+  //
+  // Field ResourceMask has exactly one bit set if this object describes a
+  // simple ProcResource. For ProcResource only, this field can be used as a
+  // unique identifer.
+  // Field ResourceMask for resource groups is obtained by OR'ing
+  // the value of ResourceMask of every ProcResourceUnits in the group.
+  uint64_t ResourceMask;
+
+  // A ProcResource can specify a number of units. A ProcResource with more
+  // than one unit behaves like a group.
+  // This field has one bit set for each resource in the group.
+  // Note that in this file, we often call sub-resource a resource which is
+  // part of a group.
+  // For ProcResGroup, this field defaults to 'ResourceMask'. For non-group
+  // resources, the number of bits set in this mask is equivalent to the
+  // number of units (i.e. field 'NumUnits' in 'ProcResourceUnits').
+  uint64_t ResourceSizeMask;
+
+  // A simple round-robin selector for processor resources.
+  // Each bit of the mask identifies a sub resource within this group.
+  //
+  // As an example, lets assume that this ResourceState describes a
+  // processor resource group composed of the following three units:
+  //   ResourceA -- 0b001
+  //   ResourceB -- 0b010
+  //   ResourceC -- 0b100
+  //
+  // Each unit is identified by a ResourceMask which always contains a
+  // single bit set. Field NextInSequenceMask is initially set to value
+  // 0xb111. That value is obtained by OR'ing the resource masks of
+  // processor resource that are part of the group.
+  //
+  //   NextInSequenceMask  -- 0b111
+  //
+  // Field NextInSequenceMask is used by the resource manager (i.e.
+  // an object of class ResourceManager) to select the "next available resource"
+  // from the set. The algorithm would prioritize resources with a bigger
+  // ResourceMask value.
+  //
+  // In this example, there are three resources in the set, and 'ResourceC'
+  // has the highest mask value. The round-robin selector would firstly select
+  //  'ResourceC', then 'ResourceB', and eventually 'ResourceA'.
+  //
+  // When a resource R is used, its corresponding bit is cleared from the set.
+  //
+  // Back to the example:
+  // If 'ResourceC' is selected, then the new value of NextInSequenceMask
+  // becomes 0xb011.
+  //
+  // When NextInSequenceMask becomes zero, it is reset to its original value
+  // (in this example, that value would be 0b111).
+  uint64_t NextInSequenceMask;
+
+  // Some instructions can only be issued on very specific pipeline resources.
+  // For those instructions, we know exactly which resource would be consumed
+  // without having to dynamically select it using field 'NextInSequenceMask'.
+  //
+  // The resource mask bit associated to the (statically) selected
+  // processor resource is still cleared from the 'NextInSequenceMask'.
+  // If that bit was already zero in NextInSequenceMask, then we update
+  // mask 'RemovedFromNextInSequence'.
+  //
+  // When NextInSequenceMask is reset back to its initial value, the algorithm
+  // removes any bits which are set in RemoveFromNextInSequence.
+  uint64_t RemovedFromNextInSequence;
+
+  // A mask of ready units.
+  uint64_t ReadyMask;
+
+  // Buffered resources will have this field set to a positive number bigger
+  // than 0. A buffered resource behaves like a separate reservation station
+  // implementing its own buffer for out-of-order execution.
+  // A buffer of 1 is for units that force in-order execution.
+  // A value of 0 is treated specially. In particular, a resource with
+  // A BufferSize = 0 is for an in-order issue/dispatch resource.
+  // That means, this resource is reserved starting from the dispatch event,
+  // until all the "resource cycles" are consumed after the issue event.
+  // While this resource is reserved, no other instruction may be dispatched.
+  int BufferSize;
+
+  // Available slots in the buffer (zero, if this is not a buffered resource).
+  unsigned AvailableSlots;
+
+  // Maximum number of buffer slots seen used during one cycle.
+  // This helps tracking dynamic dispatch stalls caused by the lack of
+  // entries in the scheduler's queue.
+  unsigned MaxUsedSlots;
+
+  // True if this is resource is currently unavailable.
+  // An instruction may "reserve" a resource for a number of cycles.
+  // During those cycles, the reserved resource cannot be used for other
+  // instructions, even if the ReadyMask is set.
+  bool Unavailable;
+
+  bool IsSubResourceReady(uint64_t ID) const { return ReadyMask & ID; }
+
+  /// Returns the mask identifier of the next available resource in the set.
+  uint64_t GetNextInSequence() const {
+    assert(NextInSequenceMask);
+    return PowerOf2Floor(NextInSequenceMask);
+  }
+
+  /// Returns the mask of the next available resource within the set,
+  /// and updates the resource selector.
+  void UpdateNextInSequence() {
+    NextInSequenceMask ^= GetNextInSequence();
+    if (!NextInSequenceMask)
+      NextInSequenceMask = ResourceSizeMask;
+  }
+
+  uint64_t ComputeResourceSizeMaskForGroup(uint64_t ResourceMask) {
+    assert(countPopulation(ResourceMask) > 1);
+    return ResourceMask ^ PowerOf2Floor(ResourceMask);
+  }
+
+public:
+  ResourceState(const MCProcResourceDesc &Desc, uint64_t Mask)
+      : ResourceMask(Mask) {
+    bool IsAGroup = countPopulation(ResourceMask) > 1;
+    ResourceSizeMask = IsAGroup ? ComputeResourceSizeMaskForGroup(ResourceMask)
+                                : ((1ULL << Desc.NumUnits) - 1);
+    NextInSequenceMask = ResourceSizeMask;
+    RemovedFromNextInSequence = 0;
+    ReadyMask = ResourceSizeMask;
+    BufferSize = Desc.BufferSize;
+    AvailableSlots = BufferSize == -1 ? 0U : static_cast<unsigned>(BufferSize);
+    MaxUsedSlots = 0;
+    Unavailable = false;
+  }
+
+  uint64_t GetResourceMask() const { return ResourceMask; }
+  int GetBufferSize() const { return BufferSize; }
+  unsigned GetMaxUsedSlots() const { return MaxUsedSlots; }
+
+  bool IsBuffered() const { return BufferSize > 0; }
+  bool IsInOrder() const { return BufferSize == 1; }
+  bool IsADispatchHazard() const { return BufferSize == 0; }
+  bool IsReserved() const { return Unavailable; }
+
+  void SetReserved() { Unavailable = true; }
+  void ClearReserved() { Unavailable = false; }
+
+  // A resource is ready if it is not reserved, and if there are enough
+  // available units.
+  // If a resource is also a dispatch hazard, then we don't check if
+  // it is reserved because that check would always return true.
+  // A resource marked as "dispatch hazard" is always reserved at
+  // dispatch time. When this method is called, the assumption is that
+  // the user of this resource has been already dispatched.
+  bool IsReady(unsigned NumUnits = 1) const {
+    return (!IsReserved() || IsADispatchHazard()) &&
+           countPopulation(ReadyMask) >= NumUnits;
+  }
+  bool IsAResourceGroup() const { return countPopulation(ResourceMask) > 1; }
+
+  bool ContainsResource(uint64_t ID) const { return ResourceMask & ID; }
+
+  void MarkSubResourceAsUsed(uint64_t ID) {
+    assert(IsSubResourceReady(ID));
+    ReadyMask ^= ID;
+  }
+
+  void ReleaseSubResource(uint64_t ID) {
+    assert(!IsSubResourceReady(ID));
+    ReadyMask ^= ID;
+  }
+
+  unsigned GetNumUnits() const {
+    return IsAResourceGroup() ? 1U : countPopulation(ResourceSizeMask);
+  }
+
+  uint64_t SelectNextInSequence();
+  void RemoveFromNextInSequence(uint64_t ID);
+
+  ResourceStateEvent IsBufferAvailable() const {
+    if (IsADispatchHazard() && IsReserved())
+      return RS_RESERVED;
+    if (!IsBuffered() || AvailableSlots)
+      return RS_BUFFER_AVAILABLE;
+    return RS_BUFFER_UNAVAILABLE;
+  }
+
+  void ReserveBuffer() {
+    if (AvailableSlots)
+      AvailableSlots--;
+    unsigned UsedSlots = static_cast<unsigned>(BufferSize) - AvailableSlots;
+    MaxUsedSlots = std::max(MaxUsedSlots, UsedSlots);
+  }
+
+  void ReleaseBuffer() {
+    if (BufferSize > 0)
+      AvailableSlots++;
+    assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
+  }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief A resource unit identifier.
+///
+/// This is used to identify a specific processor resource unit using a pair
+/// of indices where the 'first' index is a processor resource mask, and the
+/// 'second' index is an index for a "sub-resource" (i.e. unit).
+typedef std::pair<uint64_t, uint64_t> ResourceRef;
+
+// First: a resource mask identifying a buffered resource.
+// Second: max number of buffer entries used in this resource.
+typedef std::pair<uint64_t, unsigned> BufferUsageEntry;
+
+/// A resource manager for processor resource units and groups.
+///
+/// This class owns all the ResourceState objects, and it is responsible for
+/// acting on requests from a Scheduler by updating the internal state of
+/// ResourceState objects.
+/// This class doesn't know about instruction itineraries and functional units.
+/// In future, it can be extended to support itineraries too through the same
+/// public interface.
+class ResourceManager {
+  // The resource manager owns all the ResourceState.
+  SmallDenseMap<uint64_t, std::unique_ptr<ResourceState>> Resources;
+
+  // Keeps track of which resources are busy, and how many cycles are left
+  // before those become usable again.
+  SmallDenseMap<ResourceRef, unsigned> BusyResources;
+
+  // A table to map processor resource IDs to processor resource masks.
+  SmallVector<uint64_t, 8> ProcResID2Mask;
+
+  // Adds a new resource state in Resources, as well as a new descriptor in
+  // ResourceDescriptor.
+  void AddResource(const MCProcResourceDesc &Desc, uint64_t Mask);
+
+  // Compute processor resource masks for each processor resource declared by
+  // the scheduling model.
+  void ComputeProcResourceMasks(const MCSchedModel &SM);
+
+  // Populate resource descriptors.
+  void initialize(const MCSchedModel &SM) {
+    ComputeProcResourceMasks(SM);
+    for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i)
+      AddResource(*SM.getProcResource(i), ProcResID2Mask[i]);
+  }
+
+  // Returns the actual resource unit that will be used.
+  ResourceRef SelectPipe(uint64_t ResourceID);
+
+  void Use(ResourceRef RR);
+  void Release(ResourceRef RR);
+
+  unsigned GetNumUnits(uint64_t ResourceID) const {
+    assert(Resources.find(ResourceID) != Resources.end());
+    return Resources.find(ResourceID)->getSecond()->GetNumUnits();
+  }
+
+  // Reserve a specific Resource kind.
+  void ReserveBuffer(uint64_t ResourceID) {
+    assert(IsBufferAvailable(ResourceID) ==
+           ResourceStateEvent::RS_BUFFER_AVAILABLE);
+    ResourceState &Resource = *Resources[ResourceID];
+    Resource.ReserveBuffer();
+  }
+
+  void ReleaseBuffer(uint64_t ResourceID) {
+    Resources[ResourceID]->ReleaseBuffer();
+  }
+
+  ResourceStateEvent IsBufferAvailable(uint64_t ResourceID) const {
+    const ResourceState &Resource = *Resources.find(ResourceID)->second;
+    return Resource.IsBufferAvailable();
+  }
+
+  bool IsReady(uint64_t ResourceID, unsigned NumUnits) const {
+    const ResourceState &Resource = *Resources.find(ResourceID)->second;
+    return Resource.IsReady(NumUnits);
+  }
+
+public:
+  ResourceManager(const MCSchedModel &SM) { initialize(SM); }
+
+  ResourceStateEvent CanBeDispatched(const InstrDesc &Desc) const {
+    ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE;
+    for (uint64_t Buffer : Desc.Buffers) {
+      Result = IsBufferAvailable(Buffer);
+      if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE)
+        break;
+    }
+
+    return Result;
+  }
+
+  void ReserveBuffers(const InstrDesc &Desc) {
+    for (const auto &R : Desc.Buffers)
+      ReserveBuffer(R);
+  }
+
+  void ReleaseBuffers(const InstrDesc &Desc) {
+    for (const auto &R : Desc.Buffers)
+      ReleaseBuffer(R);
+  }
+
+  void ReserveResource(uint64_t ResourceID) {
+    ResourceState &Resource = *Resources[ResourceID];
+    assert(!Resource.IsReserved());
+    Resource.SetReserved();
+  }
+
+  void ReleaseResource(uint64_t ResourceID) {
+    ResourceState &Resource = *Resources[ResourceID];
+    Resource.ClearReserved();
+  }
+
+  void ReserveDispatchHazardResources(const InstrDesc &Desc);
+
+  // Returns true if all resources are in-order, and there is at least one
+  // resource which is a dispatch hazard (BufferSize = 0).
+  bool MustIssueImmediately(const InstrDesc &Desc);
+
+  bool CanBeIssued(const InstrDesc &Desc) const;
+  double GetRThroughput(const InstrDesc &Desc) const;
+
+  void
+  IssueInstruction(unsigned Index, const InstrDesc &Desc,
+                   SmallVector<std::pair<ResourceRef, unsigned>, 4> &Pipes);
+
+  void CycleEvent(SmallVector<ResourceRef, 8> &ResourcesFreed);
+
+  void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+    for (const auto &Resource : Resources) {
+      const ResourceState &RS = *Resource.second;
+      if (RS.IsBuffered())
+        Usage.emplace_back(std::pair<uint64_t, unsigned>(RS.GetResourceMask(),
+                                                         RS.GetMaxUsedSlots()));
+    }
+  }
+
+  const ArrayRef<uint64_t> GetProcResMasks() const { return ProcResID2Mask; }
+
+#ifndef NDEBUG
+  void dump() const {
+    for (const auto &Resource : Resources)
+      Resource.second->dump();
+  }
+#endif
+}; // namespace mca
+
+/// Class Scheduler is responsible for issuing instructions to pipeline
+/// resources. Internally, it delegates to a ResourceManager the management of
+/// processor resources.
+/// This class is also responsible for tracking the progress of instructions
+/// from the dispatch stage, until the write-back stage.
+///
+/// An nstruction dispatched to the Scheduler is initially placed into either
+/// the 'WaitQueue' or the 'ReadyQueue' depending on the availability of the input
+/// operands.
+/// Instructions in the WaitQueue are ordered by instruction index.
+/// An instruction is moved from the WaitQueue to the ReadyQueue when register
+/// operands become available, and all memory dependencies are met.
+/// Instructions that are moved from the WaitQueue to the ReadyQueue transition
+/// from state 'IS_AVAILABLE' to state 'IS_READY'.
+///
+/// At the beginning of each cycle, the Scheduler checks if there are
+/// instructions in the WaitQueue that can be moved to the ReadyQueue.  If the
+/// ReadyQueue is not empty, then older instructions from the queue are issued to
+/// the processor pipelines, and the underlying ResourceManager is updated
+/// accordingly.  The ReadyQueue is ordered by instruction index to guarantee
+/// that the first instructions in the set are also the oldest.
+///
+/// An Instruction is moved from the ReadyQueue the `IssuedQueue` when it is
+/// issued to a (one or more) pipeline(s). This event also causes an instruction
+/// state transition (i.e. from state IS_READY, to state IS_EXECUTING).
+/// An Instruction leaves the IssuedQueue when it reaches the write-back stage.
+class Scheduler {
+  const MCSchedModel &SM;
+
+  // Hardware resources that are managed by this scheduler.
+  std::unique_ptr<ResourceManager> Resources;
+  std::unique_ptr<LSUnit> LSU;
+
+  // The Backend gets notified when instructions are ready/issued/executed.
+  Backend *Owner;
+
+  std::map<unsigned, Instruction *> WaitQueue;
+  std::map<unsigned, Instruction *> ReadyQueue;
+  std::map<unsigned, Instruction *> IssuedQueue;
+
+  void NotifyInstructionIssued(
+      unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+  void NotifyInstructionExecuted(unsigned Index);
+  void NotifyInstructionReady(unsigned Index);
+  void NotifyResourceAvailable(const ResourceRef &RR);
+
+  /// Issue instructions from the ready queue by giving priority to older
+  /// instructions.
+  void Issue();
+
+  /// Issue an instruction without updating the ready queue.
+  void IssueInstruction(Instruction *IS, unsigned InstrIndex);
+  void UpdatePendingQueue();
+  void UpdateIssuedQueue();
+
+public:
+  Scheduler(Backend *B, const MCSchedModel &Model, unsigned LoadQueueSize,
+            unsigned StoreQueueSize, bool AssumeNoAlias)
+      : SM(Model), Resources(llvm::make_unique<ResourceManager>(SM)),
+        LSU(llvm::make_unique<LSUnit>(LoadQueueSize, StoreQueueSize,
+                                      AssumeNoAlias)),
+        Owner(B) {}
+
+  /// Scheduling events.
+  ///
+  /// The DispatchUnit is responsible for querying the Scheduler before
+  /// dispatching new instructions. Queries are performed through method
+  /// `Scheduler::CanBeDispatched`, which returns an instance of this enum to
+  /// tell if the dispatch would fail or not.  If scheduling resources are
+  /// available, and the instruction can be dispatched, then the query returns
+  /// HWS_AVAILABLE.  A values different than HWS_AVAILABLE means that the
+  /// instruction cannot be dispatched during this cycle.
+  ///
+  /// Each event name starts with prefix "HWS_", and it is followed by
+  /// a substring which describes the reason why the Scheduler was unavailable
+  /// (or "AVAILABLE" if the instruction is allowed to be dispatched).
+  ///
+  /// HWS_QUEUE_UNAVAILABLE is returned if there are not enough available slots
+  /// in the  scheduler's queue. That means, one (or more) buffered resources
+  /// consumed by the instruction were full.
+  ///
+  /// HWS_LD_QUEUE_UNAVAILABLE is returned when the instruction 'mayLoad', and
+  /// the load queue in the load/store unit (implemented by class LSUnit) is
+  /// full.  Similarly, HWS_ST_QUEUE_UNAVAILABLE is returned when the store queue
+  /// is full, and the instruction to be dispatched 'mayStore'.
+  ///
+  /// HWS_DISPATCH_GROUP_RESTRICTION is only returned in special cases where the
+  /// instruction consumes an in-order issue/dispatch resource (i.e. a resource
+  /// with `BufferSize=0`), and the resource is not ready at dispatch
+  /// stage.
+  enum Event {
+    HWS_AVAILABLE,
+    HWS_QUEUE_UNAVAILABLE,
+    HWS_DISPATCH_GROUP_RESTRICTION,
+    HWS_LD_QUEUE_UNAVAILABLE,
+    HWS_ST_QUEUE_UNAVAILABLE
+  };
+
+  Event CanBeDispatched(const InstrDesc &Desc) const;
+  Instruction *ScheduleInstruction(unsigned Idx, Instruction *MCIS);
+
+  double GetRThroughput(const InstrDesc &Desc) const {
+    return Resources->GetRThroughput(Desc);
+  }
+
+  void CycleEvent(unsigned Cycle);
+
+  void GetBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+    Resources->getBuffersUsage(Usage);
+  }
+
+  const ArrayRef<uint64_t> GetProcResourceMasks() const {
+    return Resources->GetProcResMasks();
+  }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+} // Namespace mca
+
+#endif
Index: tools/llvm-mca/Scheduler.cpp
===================================================================
--- tools/llvm-mca/Scheduler.cpp
+++ tools/llvm-mca/Scheduler.cpp
@@ -0,0 +1,454 @@
+//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A scheduler for processor resource units and processor resource groups.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Scheduler.h"
+#include "Backend.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+uint64_t ResourceState::SelectNextInSequence() {
+  assert(IsReady());
+  uint64_t Next = GetNextInSequence();
+  while (!IsSubResourceReady(Next)) {
+    UpdateNextInSequence();
+    Next = GetNextInSequence();
+  }
+  return Next;
+}
+
+#ifndef NDEBUG
+void ResourceState::dump() const {
+  dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask
+         << ", NEXT: " << NextInSequenceMask << ", RDYMASK: " << ReadyMask
+         << ", BufferSize=" << BufferSize
+         << ", AvailableSlots=" << AvailableSlots
+         << ", Reserved=" << Unavailable << '\n';
+}
+#endif
+
+// Adds a new resource state in Resources, as well as a new descriptor in
+// ResourceDescriptor. Map 'Resources' allows to quickly obtain ResourceState
+// objects from resource mask identifiers.
+void ResourceManager::AddResource(const MCProcResourceDesc &Desc,
+                                  uint64_t Mask) {
+  assert(Resources.find(Mask) == Resources.end() && "Resource already added!");
+  Resources[Mask] = llvm::make_unique<ResourceState>(Desc, Mask);
+}
+
+// Populate vector ProcResID2Mask with resource masks. One per each processor
+// resource declared by the scheduling model.
+void ResourceManager::ComputeProcResourceMasks(const MCSchedModel &SM) {
+  unsigned ProcResourceID = 0;
+
+  // Create a unique bitmask for every processor resource unit.
+  // Skip resource at index 0, since it always references 'InvalidUnit'.
+  ProcResID2Mask.resize(SM.getNumProcResourceKinds());
+  for (unsigned i = 1; i < SM.getNumProcResourceKinds(); ++i) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(i);
+    if (Desc.SubUnitsIdxBegin)
+      continue;
+    ProcResID2Mask[i] = 1ULL << ProcResourceID;
+    ProcResourceID++;
+  }
+
+  // Create a unique bitmask for every processor resource group.
+  for (unsigned i = 1; i < SM.getNumProcResourceKinds(); ++i) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(i);
+    if (!Desc.SubUnitsIdxBegin)
+      continue;
+    ProcResID2Mask[i] |= 1ULL << ProcResourceID;
+    for (unsigned U = 0; U < Desc.NumUnits; ++U) {
+      uint64_t OtherMask = ProcResID2Mask[Desc.SubUnitsIdxBegin[U]];
+      ProcResID2Mask[i] |= OtherMask;
+    }
+    ProcResourceID++;
+  }
+}
+
+// Returns the actual resource consumed by this Use.
+// First, is the primary resource ID.
+// Second, is the specific sub-resource ID.
+std::pair<uint64_t, uint64_t> ResourceManager::SelectPipe(uint64_t ResourceID) {
+  ResourceState &RS = *Resources[ResourceID];
+  uint64_t SubResourceID = RS.SelectNextInSequence();
+  if (RS.IsAResourceGroup())
+    return SelectPipe(SubResourceID);
+  return std::pair<uint64_t, uint64_t>(ResourceID, SubResourceID);
+}
+
+void ResourceState::RemoveFromNextInSequence(uint64_t ID) {
+  assert(NextInSequenceMask);
+  assert(countPopulation(ID) == 1);
+  if (ID > GetNextInSequence())
+    RemovedFromNextInSequence |= ID;
+  NextInSequenceMask = NextInSequenceMask & (~ID);
+  if (!NextInSequenceMask) {
+    NextInSequenceMask = ResourceSizeMask;
+    assert(NextInSequenceMask != RemovedFromNextInSequence);
+    NextInSequenceMask ^= RemovedFromNextInSequence;
+    RemovedFromNextInSequence = 0;
+  }
+}
+
+void ResourceManager::Use(ResourceRef RR) {
+  // Mark the sub-resource referenced by RR as used.
+  ResourceState &RS = *Resources[RR.first];
+  RS.MarkSubResourceAsUsed(RR.second);
+  // If there are still available units in RR.first,
+  // then we are done.
+  if (RS.IsReady())
+    return;
+
+  // Notify to other resources that RR.first is no longer available.
+  for (auto &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.IsAResourceGroup() || Current.GetResourceMask() == RR.first)
+      continue;
+
+    if (Current.ContainsResource(RR.first)) {
+      Current.MarkSubResourceAsUsed(RR.first);
+      Current.RemoveFromNextInSequence(RR.first);
+    }
+  }
+}
+
+void ResourceManager::Release(ResourceRef RR) {
+  ResourceState &RS = *Resources[RR.first];
+  bool WasFullyUsed = !RS.IsReady();
+  RS.ReleaseSubResource(RR.second);
+  if (!WasFullyUsed)
+    return;
+
+  for (auto &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.IsAResourceGroup() || Current.GetResourceMask() == RR.first)
+      continue;
+
+    if (Current.ContainsResource(RR.first))
+      Current.ReleaseSubResource(RR.first);
+  }
+}
+
+void ResourceManager::ReserveDispatchHazardResources(const InstrDesc &Desc) {
+  for (const auto &R : Desc.Buffers) {
+    ResourceState &Resource = *Resources[R];
+    if (Resource.IsADispatchHazard()) {
+      assert(!Resource.IsReserved());
+      Resource.SetReserved();
+    }
+  }
+}
+
+bool ResourceManager::CanBeIssued(const InstrDesc &Desc) const {
+  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
+                     [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+                       unsigned NumUnits =
+                           E.second.IsReserved() ? 0U : E.second.NumUnits;
+                       return IsReady(E.first, NumUnits);
+                     });
+}
+
+// Returns true if all resources are in-order, and there is at least one
+// resource which is a dispatch hazard (BufferSize = 0).
+bool ResourceManager::MustIssueImmediately(const InstrDesc &Desc) {
+  if (!CanBeIssued(Desc))
+    return false;
+  bool AllInOrderResources =
+    std::all_of(Desc.Buffers.begin(), Desc.Buffers.end(),
+                [&](const unsigned BufferMask) {
+                    const ResourceState &Resource = *Resources[BufferMask];
+                    return Resource.IsInOrder() || Resource.IsADispatchHazard();
+                });
+  if (!AllInOrderResources)
+    return false;
+
+  return std::any_of(Desc.Buffers.begin(), Desc.Buffers.end(),
+                [&](const unsigned BufferMask) {
+                    return Resources[BufferMask]->IsADispatchHazard();
+                });
+}
+
+double ResourceManager::GetRThroughput(const InstrDesc &ID) const {
+  double RThroughput = 0;
+  for (const auto &Usage : ID.Resources) {
+    const CycleSegment &CS = Usage.second.CS;
+    assert(CS.begin() == 0);
+
+    if (Usage.second.IsReserved()) {
+      RThroughput = std::max(RThroughput, (double)CS.size());
+      continue;
+    }
+
+    unsigned Population = std::max(1U, countPopulation(Usage.first) - 1);
+    unsigned NumUnits = Population * GetNumUnits(Usage.first);
+    NumUnits -= Usage.second.NumUnits - 1;
+    unsigned Cycles = CS.size();
+    RThroughput = std::max(RThroughput, (double)Cycles / NumUnits);
+  }
+  return RThroughput;
+}
+
+void ResourceManager::IssueInstruction(
+    unsigned Index, const InstrDesc &Desc,
+    SmallVector<std::pair<ResourceRef, unsigned>, 4> &Pipes) {
+  ReleaseBuffers(Desc);
+  for (const auto &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    if (!CS.size()) {
+      ReleaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (!R.second.IsReserved()) {
+      auto Pipe = SelectPipe(R.first);
+      Use(Pipe);
+      BusyResources[Pipe] += CS.size();
+      Pipes.emplace_back(std::pair<ResourceRef, unsigned>(Pipe, CS.size()));
+    } else {
+      // Mark this group as reserved.
+      assert(R.second.IsReserved());
+      assert((countPopulation(R.first) > 1) && "Expected a group!");
+      ReserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+    }
+  }
+}
+
+void ResourceManager::CycleEvent(SmallVector<ResourceRef, 8> &ResourcesFreed) {
+  for (auto &BR : BusyResources) {
+    if (BR.second)
+      BR.second--;
+    if (!BR.second) {
+      // Release this resource.
+      const ResourceRef &RR = BR.first;
+
+      if (countPopulation(RR.first) == 1)
+        Release(RR);
+
+      ReleaseResource(RR.first);
+      ResourcesFreed.push_back(RR);
+    }
+  }
+
+  for (const auto &RF : ResourcesFreed)
+    BusyResources.erase(RF);
+}
+
+Instruction *Scheduler::ScheduleInstruction(unsigned Idx, Instruction *MCIS) {
+  assert(WaitQueue.find(Idx) == WaitQueue.end());
+  assert(ReadyQueue.find(Idx) == ReadyQueue.end());
+  assert(IssuedQueue.find(Idx) == IssuedQueue.end());
+
+  // Special case where MCIS is a zero-latency instruction.  A zero-latency
+  // instruction doesn't consume any scheduler resources.  That is because it
+  // doesn't need to be executed.  Most of the times, zero latency instructions
+  // are removed at register renaming stage. For example, register-register
+  // moves can be removed at register renaming stage by creating new aliases.
+  // Zero-idiom instruction (for example: a `xor reg, reg`) can also be
+  // eliminated at register renaming stage, since we know in advance that those
+  // clear their output register.
+  if (MCIS->IsZeroLatency()) {
+    MCIS->ForceExecuted();
+    NotifyInstructionIssued(Idx, {});
+    NotifyInstructionExecuted(Idx);
+    return MCIS;
+  }
+
+  const InstrDesc &Desc = MCIS->GetDesc();
+
+  // Consume entries in the reservation stations.
+  Resources->ReserveBuffers(Desc);
+
+  // Mark units with BufferSize=0 as reserved. These resources will only
+  // be released after MCIS is issued, and all the ResourceCycles for
+  // those units have been consumed.
+  Resources->ReserveDispatchHazardResources(Desc);
+
+  bool MayLoad = Desc.MayLoad;
+  bool MayStore = Desc.MayStore;
+  if (MayLoad || MayStore)
+    LSU->Reserve(Idx, MayLoad, MayStore, Desc.HasSideEffects);
+
+  MCIS->Dispatch();
+  bool IsReady = MCIS->IsReady();
+  if (IsReady && (MayLoad || MayStore))
+    IsReady &= LSU->IsReady(Idx);
+
+  if (!IsReady) {
+    DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Wait Queue\n");
+    WaitQueue[Idx] = MCIS;
+    return MCIS;
+  }
+  NotifyInstructionReady(Idx);
+
+  // Special case where the instruction is ready, and it uses an in-order dispatch/issue
+  // processor resource. The instruction is issued immediately to the pipelines.
+  // Any other in-order buffered resources (i.e. BufferSize=1) are consumed.
+  if (Resources->MustIssueImmediately(Desc)) {
+    DEBUG(dbgs() << "[SCHEDULER] Instruction " << Idx << " issued immediately\n");
+    IssueInstruction(MCIS, Idx);
+    return MCIS;
+  }
+
+  DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Ready Queue\n");
+  ReadyQueue[Idx] = MCIS;
+  return MCIS;
+}
+
+void Scheduler::CycleEvent(unsigned /* unused */) {
+  SmallVector<ResourceRef, 8> ResourcesFreed;
+  Resources->CycleEvent(ResourcesFreed);
+
+  for (const ResourceRef &RR : ResourcesFreed)
+    NotifyResourceAvailable(RR);
+
+  UpdateIssuedQueue();
+  UpdatePendingQueue();
+  Issue();
+}
+
+#ifndef NDEBUG
+void Scheduler::dump() const {
+  dbgs() << "[SCHEDULER]: WaitQueue size is: " << WaitQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: ReadyQueue size is: " << ReadyQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: IssuedQueue size is: " << IssuedQueue.size() << '\n';
+  Resources->dump();
+}
+#endif
+
+Scheduler::Event Scheduler::CanBeDispatched(const InstrDesc &Desc) const {
+  if (Desc.MayLoad && LSU->IsLQFull())
+    return HWS_LD_QUEUE_UNAVAILABLE;
+  if (Desc.MayStore && LSU->IsSQFull())
+    return HWS_ST_QUEUE_UNAVAILABLE;
+
+  Scheduler::Event Event;
+  switch (Resources->CanBeDispatched(Desc)) {
+  case ResourceStateEvent::RS_BUFFER_AVAILABLE:
+    Event = HWS_AVAILABLE;
+    break;
+  case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
+    Event = HWS_QUEUE_UNAVAILABLE;
+    break;
+  case ResourceStateEvent::RS_RESERVED:
+    Event = HWS_DISPATCH_GROUP_RESTRICTION;
+  }
+  return Event;
+}
+
+void Scheduler::IssueInstruction(Instruction *IS, unsigned InstrIndex) {
+  // Issue the instruction and collect all the consumed resources
+  // into a vector. That vector is then used to notify the listener.
+  // Most instructions consume very few resurces (typically one or
+  // two resources). We use a small vector here, and conservatively
+  // initialize its capacity to 4. This should address the majority of
+  // the cases.
+  SmallVector<std::pair<ResourceRef, unsigned>, 4> UsedResources;
+
+  const InstrDesc &D = IS->GetDesc();
+  Resources->IssueInstruction(InstrIndex, D, UsedResources);
+  // Notify the instruction that it started executing.
+  // This updates the internal state of each write.
+  IS->Execute();
+
+  if (D.MaxLatency) {
+    IssuedQueue[InstrIndex] = IS;
+    NotifyInstructionIssued(InstrIndex, UsedResources);
+  } else {
+    // A zero latency instruction which reads and/or updates registers.
+    NotifyInstructionIssued(InstrIndex, UsedResources);
+    NotifyInstructionExecuted(InstrIndex);
+  }
+}
+
+void Scheduler::Issue() {
+  std::vector<unsigned> ToRemove;
+  for (auto QueueEntry : ReadyQueue) {
+    // Give priority to older instructions in ReadyQueue. The ready queue is
+    // ordered by key, and therefore older instructions are visited first.
+    Instruction *IS = QueueEntry.second;
+    const InstrDesc &D = IS->GetDesc();
+    if (!Resources->CanBeIssued(D))
+      continue;
+    unsigned InstrIndex = QueueEntry.first;
+    IssueInstruction(IS, InstrIndex);
+    ToRemove.emplace_back(InstrIndex);
+  }
+
+  for (const unsigned InstrIndex : ToRemove)
+    ReadyQueue.erase(InstrIndex);
+}
+
+void Scheduler::UpdatePendingQueue() {
+  // Scan the set of waiting instructions and promote them to the
+  // ready queue if operands are all ready.
+  for (auto I = WaitQueue.begin(), E = WaitQueue.end(); I != E;) {
+    I->second->CycleEvent();
+
+    const InstrDesc &Desc = I->second->GetDesc();
+    bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+    bool IsReady = I->second->IsReady();
+    if (IsReady && IsMemOp)
+      IsReady &= LSU->IsReady(I->first);
+
+    if (IsReady) {
+      NotifyInstructionReady(I->first);
+      ReadyQueue[I->first] = I->second;
+      auto ToRemove = I;
+      ++I;
+      WaitQueue.erase(ToRemove);
+    } else {
+      ++I;
+    }
+  }
+}
+
+void Scheduler::UpdateIssuedQueue() {
+  for (auto I = IssuedQueue.begin(), E = IssuedQueue.end(); I != E;) {
+    I->second->CycleEvent();
+    if (I->second->IsExecuted()) {
+      NotifyInstructionExecuted(I->first);
+      auto ToRemove = I;
+      ++I;
+      IssuedQueue.erase(ToRemove);
+    } else {
+      DEBUG(dbgs() << "[SCHEDULER]: Instruction " << I->first
+                   << " is still executing.\n");
+      ++I;
+    }
+  }
+}
+
+void Scheduler::NotifyInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  Owner->NotifyInstructionIssued(Index, Used);
+}
+
+void Scheduler::NotifyInstructionExecuted(unsigned Index) {
+  LSU->OnInstructionExecuted(Index);
+  Owner->NotifyInstructionExecuted(Index);
+}
+
+void Scheduler::NotifyInstructionReady(unsigned Index) {
+  Owner->NotifyInstructionReady(Index);
+}
+
+void Scheduler::NotifyResourceAvailable(const ResourceRef &RR) {
+  Owner->NotifyResourceAvailable(RR);
+}
+} // namespace mca
Index: tools/llvm-mca/SourceMgr.h
===================================================================
--- tools/llvm-mca/SourceMgr.h
+++ tools/llvm-mca/SourceMgr.h
@@ -0,0 +1,65 @@
+//===--------------------- SourceMgr.h --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements class SourceMgr. Class SourceMgr abstracts the input
+/// code sequence (a sequence of MCInst), and assings unique identifiers to
+/// every instruction in the sequence.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+
+#include "llvm/MC/MCInst.h"
+#include <vector>
+
+namespace mca {
+
+using namespace llvm;
+
+typedef std::pair<unsigned, const MCInst *> InstRef;
+
+class SourceMgr {
+  using InstVec = std::vector<std::unique_ptr<const MCInst>>;
+  InstVec Sequence;
+  unsigned Current;
+  unsigned Iterations;
+  static const unsigned DefaultIterations = 70;
+public:
+  SourceMgr(unsigned NumIterations)
+      : Current(0), Iterations(NumIterations ? NumIterations : DefaultIterations) {}
+
+  unsigned GetCurrentIteration() const { return Current / Sequence.size(); }
+  unsigned GetNumIterations() const { return Iterations; }
+  unsigned size() const { return Sequence.size(); }
+  const InstVec &GetSequence() const { return Sequence; }
+  InstVec &GetSequence() { return Sequence; }
+
+  bool HasNext() { return Current < (Iterations * size()); }
+  void UpdateNext() { Current++; }
+
+  const InstRef PeekNext() const {
+    unsigned Index = GetCurrentInstructionIndex();
+    return InstRef(Current, Sequence[Index].get());
+  }
+
+  unsigned GetCurrentInstructionIndex() const {
+    return Current % Sequence.size();
+  }
+
+  const MCInst &GetMCInstFromIndex(unsigned Index) const {
+    return *Sequence[Index % size()];
+  }
+
+  bool IsEmpty() const { return size() == 0; }
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/TimelineView.h
===================================================================
--- tools/llvm-mca/TimelineView.h
+++ tools/llvm-mca/TimelineView.h
@@ -0,0 +1,184 @@
+//===--------------------- TimelineView.h ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the backend. For every
+/// instruction executed by the backend, it stores information related to
+/// state transition. It then plots that information in the form of a table
+/// as reported by the example below:
+///
+/// Timeline view:
+///     	          0123456
+/// Index	0123456789       
+///
+/// [0,0]	DeER .    .    ..	vmovshdup  %xmm0, %xmm1
+/// [0,1]	DeER .    .    ..	vpermilpd  $1, %xmm0, %xmm2
+/// [0,2]	.DeER.    .    ..	vpermilps  $231, %xmm0, %xmm5
+/// [0,3]	.DeeeER   .    ..	vaddss  %xmm1, %xmm0, %xmm3
+/// [0,4]	. D==eeeER.    ..	vaddss  %xmm3, %xmm2, %xmm4
+/// [0,5]	. D=====eeeER  ..	vaddss  %xmm4, %xmm5, %xmm6
+///
+/// [1,0]	.  DeE------R  ..	vmovshdup  %xmm0, %xmm1
+/// [1,1]	.  DeE------R  ..	vpermilpd  $1, %xmm0, %xmm2
+/// [1,2]	.   DeE-----R  ..	vpermilps  $231, %xmm0, %xmm5
+/// [1,3]	.   D=eeeE--R  ..	vaddss  %xmm1, %xmm0, %xmm3
+/// [1,4]	.    D===eeeER ..	vaddss  %xmm3, %xmm2, %xmm4
+/// [1,5]	.    D======eeeER	vaddss  %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of character
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+///  D: Instruction Dispatched
+///  e: Instruction Executing
+///  E: Instruction Executed (write-back stage)
+///  R: Instruction retired
+///  =: Instruction waiting in the Scheduler's queue
+///  -: Instruction executed, waiting to retire in order.
+///
+/// dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated to the entry.
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle than instruction [0,0].
+/// Instruction [0,4] has been dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because operands
+/// became ready only at cycle 5.
+///
+/// This view helps further understanding bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+///	[0]	[1]	[2]	[3]
+/// 0.	 2	1.0	1.0	3.0	vmovshdup  %xmm0, %xmm1
+/// 1.	 2	1.0	1.0	3.0	vpermilpd  $1, %xmm0, %xmm2
+/// 2.	 2	1.0	1.0	2.5	vpermilps  $231, %xmm0, %xmm5
+/// 3.	 2	1.5	0.5	1.0	vaddss  %xmm1, %xmm0, %xmm3
+/// 4.	 2	3.5	0.0	0.0	vaddss  %xmm3, %xmm2, %xmm4
+/// 5.	 2	6.5	0.0	0.0	vaddss  %xmm4, %xmm5, %xmm6
+///
+/// By comparing column [2] with column [1], we get an idea about how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+
+namespace mca {
+
+using namespace llvm;
+
+/// \brief This class listens to instruction state transition events
+/// in order to construct a timeline information.
+///
+/// For every instruction executed by the Backend, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public HWEventListener {
+  const MCSubtargetInfo &STI;
+  MCInstPrinter &MCIP;
+  const SourceMgr &AsmSequence;
+
+  unsigned CurrentCycle;
+  unsigned MaxCycle;
+  unsigned LastCycle;
+
+  struct TimelineViewEntry {
+    unsigned CycleDispatched;
+    unsigned CycleReady;
+    unsigned CycleIssued;
+    unsigned CycleExecuted;
+    unsigned CycleRetired;
+  };
+  std::vector<TimelineViewEntry> Timeline;
+
+  struct WaitTimeEntry {
+    unsigned Executions;
+    unsigned CyclesSpentInSchedulerQueue;
+    unsigned CyclesSpentInSQWhileReady;
+    unsigned CyclesSpentAfterWBAndBeforeRetire;
+  };
+  std::vector<WaitTimeEntry> WaitTime;
+
+  void printTimelineViewEntry(raw_string_ostream &OS,
+                              const TimelineViewEntry &E, unsigned Iteration,
+                              unsigned SourceIndex) const;
+  void printWaitTimeEntry(raw_string_ostream &OS, const WaitTimeEntry &E,
+                          unsigned Index) const;
+
+  const unsigned DEFAULT_ITERATIONS = 10;
+
+public:
+  TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
+               const SourceMgr &Sequence, unsigned MaxIterations,
+               unsigned Cycles)
+      : STI(sti), MCIP(Printer), AsmSequence(Sequence), CurrentCycle(0),
+        MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0) {
+    initialize(MaxIterations);
+  }
+
+  void initialize(unsigned MaxIterations);
+
+  // Event handlers.
+  void OnInstructionDispatched(unsigned Index) override;
+  void OnInstructionReady(unsigned Index) override;
+  void OnInstructionIssued(
+      unsigned Index,
+      const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+  void OnInstructionExecuted(unsigned Index) override;
+  void OnInstructionRetired(unsigned Index) override;
+  void OnCycleBegin(unsigned Cycle) override { CurrentCycle = Cycle; }
+
+  // print functionalities.
+  void printTimeline(raw_ostream &OS) const;
+  void printAverageWaitTimes(raw_ostream &OS) const;
+};
+
+} // namespace mca
+
+#endif
Index: tools/llvm-mca/TimelineView.cpp
===================================================================
--- tools/llvm-mca/TimelineView.cpp
+++ tools/llvm-mca/TimelineView.cpp
@@ -0,0 +1,247 @@
+//===--------------------- TimelineView.cpp ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements the TimelineView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TimelineView.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void TimelineView::initialize(unsigned MaxIterations) {
+  unsigned NumInstructions =
+      AsmSequence.GetNumIterations() * AsmSequence.size();
+  if (!MaxIterations)
+    MaxIterations = DEFAULT_ITERATIONS;
+  unsigned NumEntries =
+      std::min(NumInstructions, MaxIterations * AsmSequence.size());
+  Timeline.resize(NumEntries);
+  TimelineViewEntry NullTVEntry = {0, 0, 0, 0, 0};
+  std::fill(Timeline.begin(), Timeline.end(), NullTVEntry);
+
+  WaitTime.resize(AsmSequence.size());
+  WaitTimeEntry NullWTEntry = {0, 0, 0, 0};
+  std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry);
+}
+
+void TimelineView::OnInstructionDispatched(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleDispatched = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::OnInstructionReady(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleReady = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::OnInstructionIssued(
+    unsigned Index,
+    const ArrayRef<std::pair<ResourceRef, unsigned>> & /* Unused */) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleIssued = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::OnInstructionExecuted(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleExecuted = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::OnInstructionRetired(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  TimelineViewEntry &TVEntry = Timeline[Index];
+  TVEntry.CycleRetired = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+
+  // Update the WaitTime entry which corresponds to this Index.
+
+  WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
+  WTEntry.Executions++;
+  WTEntry.CyclesSpentInSchedulerQueue +=
+      TVEntry.CycleIssued - TVEntry.CycleDispatched;
+  assert(TVEntry.CycleDispatched <= TVEntry.CycleReady);
+  WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady;
+  WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
+      (TVEntry.CycleRetired - 1) - TVEntry.CycleExecuted;
+}
+
+void TimelineView::printWaitTimeEntry(raw_string_ostream &OS,
+                                      const WaitTimeEntry &Entry,
+                                      unsigned SourceIndex) const {
+  OS << SourceIndex << '.';
+  if (SourceIndex < 10)
+    OS << "    ";
+  else if (SourceIndex < 100)
+    OS << "   ";
+  else if (SourceIndex < 1000)
+    OS << "  ";
+  else
+    OS << ' ';
+
+  if (Entry.Executions == 0) {
+    OS << " -      -      -      -   ";
+  } else {
+    double AverageTime1, AverageTime2, AverageTime3;
+    unsigned Executions = Entry.Executions;
+    AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions;
+    AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions;
+    AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions;
+    if (Executions < 10)
+      OS << ' ' << Executions << "     ";
+    else if (Executions < 100)
+      OS << ' ' << Executions << "    ";
+    else
+      OS << Executions << "   ";
+
+
+    OS << format("%.1f", AverageTime1);
+    if (AverageTime1 < 10.0)
+      OS << "    ";
+    else if (AverageTime1 < 100.0)
+      OS << "   ";
+    else
+      OS << "  ";
+
+    OS << format("%.1f", AverageTime2);
+    if (AverageTime2 < 10.0)
+      OS << "    ";
+    else if (AverageTime2 < 100.0)
+      OS << "   ";
+    else
+      OS << "  ";
+
+    OS << format("%.1f", AverageTime3);
+    if (AverageTime3 < 10.0)
+      OS << "  ";
+    else if (AverageTime3 < 100.0)
+      OS << ' ';
+  }
+}
+
+void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
+  if (WaitTime.empty())
+    return;
+
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream
+      << "\n\nAverage Wait times (based on the timeline view):\n"
+      << "[0]: Executions\n"
+      << "[1]: Average time spent waiting in a scheduler's queue\n"
+      << "[2]: Average time spent waiting in a scheduler's queue while ready\n"
+      << "[3]: Average time elapsed from WB until retire stage\n\n";
+  TempStream << "      [0]    [1]    [2]    [3]\n";
+
+  for (unsigned i = 0; i < WaitTime.size(); ++i) {
+    printWaitTimeEntry(TempStream, WaitTime[i], i);
+    // Append the instruction info at the end of the line.
+    const MCInst &Inst = AsmSequence.GetMCInstFromIndex(i);
+    MCIP.printInst(&Inst, TempStream, "", STI);
+    TempStream << '\n';
+    TempStream.flush();
+    OS << Buffer;
+    Buffer = "";
+  }
+}
+
+void TimelineView::printTimelineViewEntry(raw_string_ostream &OS,
+                                          const TimelineViewEntry &E,
+                                          unsigned Iteration,
+                                          unsigned SourceIndex) const {
+  if (SourceIndex == 0)
+    OS << '\n';
+  OS << '[' << Iteration << ',' << SourceIndex << "]\t";
+  for (unsigned i = 0; i < E.CycleDispatched; ++i)
+    OS << ((i % 5 == 0) ? '.' : ' ');
+  OS << 'D';
+  if (E.CycleDispatched != E.CycleExecuted) {
+    // Zero latency instructions have the same value for CycleDispatched,
+    // CycleIssued and CycleExecuted.
+    for (unsigned i = E.CycleDispatched + 1; i < E.CycleIssued; ++i)
+      OS << '=';
+    if (E.CycleIssued == E.CycleExecuted)
+      OS << 'E';
+    else {
+      if (E.CycleDispatched != E.CycleIssued)
+        OS << 'e';
+      for (unsigned i = E.CycleIssued + 1; i < E.CycleExecuted; ++i)
+        OS << 'e';
+      OS << 'E';
+    }
+  }
+
+  for (unsigned i = E.CycleExecuted + 1; i < E.CycleRetired; ++i)
+    OS << '-';
+  OS << 'R';
+
+  // Skip other columns.
+  for (unsigned i = E.CycleRetired + 1; i <= LastCycle; ++i)
+    OS << ((i % 5 == 0 || i == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(raw_string_ostream &OS, unsigned Cycles) {
+  OS << "\n\nTimeline view:\n";
+  OS << "     \t";
+  for (unsigned i = 0; i <= Cycles; ++i) {
+    if (((i / 10) & 1) == 0)
+      OS << ' ';
+    else
+      OS << i % 10;
+  }
+
+  OS << "\nIndex\t";
+  for (unsigned i = 0; i <= Cycles; ++i) {
+    if (((i / 10) & 1) == 0)
+      OS << i % 10;
+    else
+      OS << ' ';
+  }
+  OS << '\n';
+}
+
+void TimelineView::printTimeline(raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  printTimelineHeader(TempStream, LastCycle);
+  TempStream.flush();
+  OS << Buffer;
+
+  for (unsigned Idx = 0; Idx < Timeline.size(); ++Idx) {
+    Buffer = "";
+    const TimelineViewEntry &E = Timeline[Idx];
+    if (E.CycleRetired == 0)
+      return;
+
+    unsigned Iteration = Idx / AsmSequence.size();
+    unsigned SourceIndex = Idx % AsmSequence.size();
+    printTimelineViewEntry(TempStream, E, Iteration, SourceIndex);
+    // Append the instruction info at the end of the line.
+    const MCInst &Inst = AsmSequence.GetMCInstFromIndex(Idx);
+    MCIP.printInst(&Inst, TempStream, "", STI);
+    TempStream << '\n';
+    TempStream.flush();
+    OS << Buffer;
+  }
+}
+
+} // namespace mca
Index: tools/llvm-mca/llvm-mca.cpp
===================================================================
--- tools/llvm-mca/llvm-mca.cpp
+++ tools/llvm-mca/llvm-mca.cpp
@@ -0,0 +1,328 @@
+//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -* -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility is a simple driver that allows static performance analysis on
+// machine code similarly to how IACA (Intel Architecture Code Analyzer) works.
+//
+//   llvm-mca [options] <file-name>
+//      -march <type>
+//      -mcpu <cpu>
+//      -o <file>
+//
+// The target defaults to the host target.
+// The cpu defaults to 'generic'.
+// The output defaults to standard output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+using namespace llvm;
+
+static cl::opt<std::string>
+    InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+                                           cl::init("-"),
+                                           cl::value_desc("filename"));
+
+static cl::opt<std::string>
+    ArchName("march", cl::desc("Target arch to assemble for, "
+                               "see -version for available targets"));
+
+static cl::opt<std::string>
+    TripleName("mtriple", cl::desc("Target triple to assemble for, "
+                                  "see -version for available targets"));
+
+static cl::opt<std::string>
+    MCPU("mcpu",
+         cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+         cl::value_desc("cpu-name"), cl::init("generic"));
+
+static cl::list<std::string>
+    MAttrs("mattr", cl::CommaSeparated,
+           cl::desc("Target specific attributes (-mattr=help for details)"),
+           cl::value_desc("a1,+a2,-a3,..."));
+
+static cl::opt<std::string> MainFileName(
+    "main-file-name",
+    cl::desc("Specifies the name we should consider the input file"));
+
+static cl::opt<unsigned>
+    OutputAsmVariant("output-asm-variant",
+                     cl::desc("Syntax variant to use for output printing"));
+
+static cl::opt<unsigned> Iterations("iterations",
+                                    cl::desc("Number of iterations to run"),
+                                    cl::init(0));
+
+static cl::opt<unsigned> DispatchWidth(
+    "dispatch",
+    cl::desc("Dispatch Width. By default it is set equal to IssueWidth"),
+    cl::init(0));
+
+static cl::opt<unsigned> MaxRetirePerCycle(
+    "max-retire-per-cycle",
+    cl::desc("Maximum number of instructions that can be retired in one cycle"),
+    cl::init(0));
+
+static cl::opt<unsigned>
+    RegisterFileSize("register-file-size",
+                     cl::desc("Maximum number of temporary registers which can "
+                              "be used for register mappings"),
+                     cl::init(0));
+
+static cl::opt<bool> PrintTimelineView("timeline",
+                                       cl::desc("Print the timeline view"),
+                                       cl::init(false));
+
+static cl::opt<unsigned> TimelineMaxIterations(
+    "timeline-max-iterations",
+    cl::desc("Maximum number of iterations to print in timeline view."),
+    cl::init(0));
+
+static cl::opt<unsigned> TimelineMaxCycles(
+    "timeline-max-cycles",
+    cl::desc(
+        "Maximum number of cycles in the timeline view. Defaults to 80 cycles"),
+    cl::init(80));
+
+static cl::opt<bool> PrintModeVerbose("verbose",
+                                      cl::desc("Enable verbose output"),
+                                      cl::init(false));
+
+static cl::opt<bool>
+    AssumeNoAlias("noalias",
+                  cl::desc("Assume that there is no memory aliasing"),
+                  cl::init(false));
+
+static cl::opt<unsigned>
+    LoadQueueSize("lqueue", cl::desc("Size of the load queue"), cl::init(0));
+static cl::opt<unsigned>
+    StoreQueueSize("squeue", cl::desc("Size of the store queue"), cl::init(0));
+
+static const Target *GetTarget(const char *ProgName) {
+  TripleName = Triple::normalize(TripleName);
+  if (TripleName.empty())
+    TripleName = Triple::normalize(sys::getDefaultTargetTriple());
+  Triple TheTriple(TripleName);
+
+  // Get the target specific parser.
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+  if (!TheTarget) {
+    errs() << ProgName << ": " << Error;
+    return nullptr;
+  }
+
+  // Return the found target.
+  return TheTarget;
+}
+
+static int AssembleInput(const char *ProgName, const Target *TheTarget,
+                         SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
+                         MCAsmInfo &MAI, MCSubtargetInfo &STI,
+                         MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
+  std::unique_ptr<MCAsmParser> Parser(createMCAsmParser(SrcMgr, Ctx, Str, MAI));
+  std::unique_ptr<MCTargetAsmParser> TAP(
+      TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions));
+
+  if (!TAP) {
+    errs() << ProgName
+           << ": error: this target does not support assembly parsing.\n";
+    return 1;
+  }
+
+  Parser->setTargetParser(*TAP);
+  return Parser->Run(false);
+}
+
+namespace {
+
+class MCStreamerWrapper final : public MCStreamer {
+  using InstVec = std::vector<std::unique_ptr<const MCInst>>;
+  InstVec &Insts;
+
+public:
+  MCStreamerWrapper(MCContext &Context, InstVec &Vec)
+      : MCStreamer(Context), Insts(Vec) {}
+
+  // We only want to intercept the emission of new instructions.
+  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                               bool /* unused */) override {
+    Insts.emplace_back(new MCInst(Inst));
+  }
+
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+    return true;
+  }
+
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override {}
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
+  void EmitGPRel32Value(const MCExpr *Value) override {}
+  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+  void EmitCOFFSymbolType(int Type) override {}
+  void EndCOFFSymbolDef() override {}
+
+  const InstVec &GetInstructionSequence() const { return Insts; }
+};
+
+} // end of anonymous namespace
+
+int main(int argc, char **argv) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  // Initialize targets and assembly parsers.
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  // Enable printing of available targets when flag --version is specified.
+  cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+  // Parse flags and initialize target options.
+  cl::ParseCommandLineOptions(argc, argv,
+                              "llvm machine code performance analyzer.\n");
+  MCTargetOptions MCOptions;
+  MCOptions.PreserveAsmComments = false;
+
+  // Get the target from the triple. If a triple is not specified, then select
+  // the default triple for the host. If the triple doesn't correspond to any
+  // registered target, then exit with an error message.
+  const char *ProgName = argv[0];
+  const Target *TheTarget = GetTarget(ProgName);
+  if (!TheTarget)
+    return 1;
+
+  // GetTarget() may replaced TripleName with a default triple.
+  // For safety, reconstruct the Triple object.
+  Triple TheTriple(TripleName);
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (std::error_code EC = BufferPtr.getError()) {
+    errs() << InputFilename << ": " << EC.message() << '\n';
+    return 1;
+  }
+
+  SourceMgr SrcMgr;
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
+  SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
+
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  assert(MRI && "Unable to create target register info!");
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  assert(MAI && "Unable to create target asm info!");
+
+  MCObjectFileInfo MOFI;
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+  MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx);
+
+  if (!MainFileName.empty())
+    Ctx.setMainFileName(MainFileName);
+
+  // Package up features to be passed to target/subtarget
+  std::string FeaturesStr;
+  if (MAttrs.size()) {
+    SubtargetFeatures Features;
+    for (unsigned i = 0; i != MAttrs.size(); ++i)
+      Features.AddFeature(MAttrs[i]);
+    FeaturesStr = Features.getString();
+  }
+
+  std::unique_ptr<buffer_ostream> BOS;
+  std::unique_ptr<mca::SourceMgr> S =
+      llvm::make_unique<mca::SourceMgr>(Iterations);
+  MCStreamerWrapper Str(Ctx, S->GetSequence());
+
+  std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr));
+  if (!STI->isCPUStringValid(MCPU))
+    return 1;
+
+  if (!STI->getSchedModel().isOutOfOrder()) {
+    errs() << "error: please specify an out-of-order cpu. '" << MCPU
+           << "' is an in-order cpu.\n";
+    return 1;
+  }
+
+  if (!STI->getSchedModel().hasInstrSchedModel()) {
+    errs()
+        << "error: unable to find instruction-level scheduling information for"
+        << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU
+        << "'.\n";
+
+    if (STI->getSchedModel().InstrItineraries)
+      errs() << "note: cpu '" << MCPU << "' provides itineraries. However, "
+             << "instruction itineraries are currently unsupported.\n";
+    return 1;
+  }
+
+  std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
+      Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI));
+  if (!IP) {
+    errs() << "error: unable to create instruction printer for target triple '"
+           << TheTriple.normalize() << "' with assembly variant "
+           << OutputAsmVariant << ".\n";
+    return 1;
+  }
+
+  int Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, Str, *MAI, *STI,
+                          *MCII, MCOptions);
+  if (Res)
+    return Res;
+
+  if (S->IsEmpty()) {
+    errs() << "error: no assembly instructions found.\n";
+    return 1;
+  }
+
+  const MCSchedModel &SM = STI->getSchedModel();
+
+  unsigned Width = SM.IssueWidth;
+  if (DispatchWidth)
+    Width = DispatchWidth;
+
+  std::unique_ptr<mca::Backend> B = llvm::make_unique<mca::Backend>(
+      *STI, *MCII, *MRI, std::move(S), Width, RegisterFileSize, MaxRetirePerCycle,
+      LoadQueueSize, StoreQueueSize, AssumeNoAlias);
+
+  std::unique_ptr<mca::BackendPrinter> Printer =
+      llvm::make_unique<mca::BackendPrinter>(*B, OutputFilename, std::move(IP),
+                                             PrintModeVerbose);
+  Printer->AddResourcePressureView();
+  if (PrintTimelineView)
+    Printer->AddTimelineView(TimelineMaxIterations, TimelineMaxCycles);
+
+  B->Run();
+  Printer->print();
+  return 0;
+}
Index: utils/TableGen/SubtargetEmitter.cpp
===================================================================
--- utils/TableGen/SubtargetEmitter.cpp
+++ utils/TableGen/SubtargetEmitter.cpp
@@ -616,7 +616,7 @@
   OS << "static const llvm::MCProcResourceDesc " << ProcModel.ModelName
      << "ProcResources"
      << "[] = {\n"
-     << "  {DBGFIELD(\"InvalidUnit\")     0, 0, 0, 0},\n";
+     << "  {\"InvalidUnit\", 0, 0, 0, 0},\n";
 
   unsigned SubUnitsOffset = 1;
   for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {
@@ -644,8 +644,9 @@
       }
       NumUnits = PRDef->getValueAsInt("NumUnits");
     }
+
     // Emit the ProcResourceDesc
-    OS << "  {DBGFIELD(\"" << PRDef->getName() << "\") ";
+    OS << "  {\"" << PRDef->getName() << "\", ";
     if (PRDef->getName().size() < 15)
       OS.indent(15 - PRDef->getName().size());
     OS << NumUnits << ", " << SuperIdx << ", " << BufferSize << ", ";