Index: docs/CommandGuide/llvm-mca.rst
===================================================================
--- docs/CommandGuide/llvm-mca.rst
+++ docs/CommandGuide/llvm-mca.rst
@@ -466,13 +466,13 @@
   Dynamic Dispatch Stall Cycles:
   RAT     - Register unavailable:                      0
   RCU     - Retire tokens unavailable:                 0
-  SCHEDQ  - Scheduler full:                            272
+  SCHEDQ  - Scheduler full:                            272  (44.6%)
   LQ      - Load queue full:                           0
   SQ      - Store queue full:                          0
   GROUP   - Static restrictions on the dispatch group: 0
 
 
-  Dispatch Logic - number of cycles where we saw N instructions dispatched:
+  Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
   [# dispatched], [# cycles]
    0,              24  (3.9%)
    1,              272  (44.6%)
@@ -520,12 +520,11 @@
 
 If we look at the *Dynamic Dispatch Stall Cycles* table, we see the counter for
 SCHEDQ reports 272 cycles.  This counter is incremented every time the dispatch
-logic is unable to dispatch a group of two instructions because the scheduler's
-queue is full.
+logic is unable to dispatch a full group because the scheduler's queue is full.
 
 Looking at the *Dispatch Logic* table, we see that the pipeline was only able to
-dispatch two instructions 51.5% of the time.  The dispatch group was limited to
-one instruction 44.6% of the cycles, which corresponds to 272 cycles.  The
+dispatch two micro opcodes 51.5% of the time.  The dispatch group was limited to
+one micro opcode 44.6% of the cycles, which corresponds to 272 cycles.  The
 dispatch statistics are displayed by either using the command option
 ``-all-stats`` or ``-dispatch-stats``.
 
Index: test/tools/llvm-mca/X86/BtVer2/register-files-1.s
===================================================================
--- test/tools/llvm-mca/X86/BtVer2/register-files-1.s
+++ test/tools/llvm-mca/X86/BtVer2/register-files-1.s
@@ -19,7 +19,7 @@
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# CHECK:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
 # CHECK-NEXT:  0,              23  (82.1%)
 # CHECK-NEXT:  2,              5  (17.9%)
Index: test/tools/llvm-mca/X86/BtVer2/register-files-2.s
===================================================================
--- test/tools/llvm-mca/X86/BtVer2/register-files-2.s
+++ test/tools/llvm-mca/X86/BtVer2/register-files-2.s
@@ -12,14 +12,14 @@
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      13
+# CHECK-NEXT: RAT     - Register unavailable:                      13  (46.4%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# CHECK:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
 # CHECK-NEXT:  0,              20  (71.4%)
 # CHECK-NEXT:  1,              6  (21.4%)
Index: test/tools/llvm-mca/X86/BtVer2/register-files-3.s
===================================================================
--- test/tools/llvm-mca/X86/BtVer2/register-files-3.s
+++ test/tools/llvm-mca/X86/BtVer2/register-files-3.s
@@ -22,17 +22,17 @@
 # CHECK-NEXT:  2      25    25.00                 U     idivl	%eax
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      26
+# CHECK-NEXT: RAT     - Register unavailable:                      26  (47.3%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# CHECK:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
 # CHECK-NEXT:  0,              53  (96.4%)
-# CHECK-NEXT:  1,              2  (3.6%)
+# CHECK-NEXT:  2,              2  (3.6%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    6
Index: test/tools/llvm-mca/X86/BtVer2/register-files-4.s
===================================================================
--- test/tools/llvm-mca/X86/BtVer2/register-files-4.s
+++ test/tools/llvm-mca/X86/BtVer2/register-files-4.s
@@ -22,17 +22,17 @@
 # CHECK-NEXT:  2      25    25.00                 U     idivl	%eax
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
-# CHECK-NEXT: RAT     - Register unavailable:                      6
+# CHECK-NEXT: RAT     - Register unavailable:                      6  (1.1%)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# CHECK:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
 # CHECK-NEXT:  0,              531  (96.0%)
-# CHECK-NEXT:  1,              22  (4.0%)
+# CHECK-NEXT:  2,              22  (4.0%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    66
Index: test/tools/llvm-mca/X86/BtVer2/register-files-5.s
===================================================================
--- test/tools/llvm-mca/X86/BtVer2/register-files-5.s
+++ test/tools/llvm-mca/X86/BtVer2/register-files-5.s
@@ -44,16 +44,16 @@
 
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
-# CHECK-NEXT: RCU     - Retire tokens unavailable:                 8
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 8  (11.6%)
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# CHECK:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
 # CHECK-NEXT:  0,              36  (52.2%)
-# CHECK-NEXT:  1,              33  (47.8%)
+# CHECK-NEXT:  2,              33  (47.8%)
 
 # CHECK:      Register File statistics:
 # CHECK-NEXT: Total number of mappings created:    66
Index: test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s
===================================================================
--- test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s
+++ test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -timeline -timeline-max-iterations=3 -dispatch-stats < %s | FileCheck %s
+
+cmpxchg16b (%rsi)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      100
+# CHECK-NEXT: Total Cycles:      2203
+# CHECK-NEXT: Dispatch Width:    4
+# CHECK-NEXT: IPC:               0.05
+# CHECK-NEXT: Block RThroughput: 4.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  19     22    4.00    *      *            cmpxchg16b	(%rsi)
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 1487  (67.5%)
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              1703  (77.3%)
+# CHECK-NEXT:  3,              100  (4.5%)
+# CHECK-NEXT:  4,              400  (18.2%)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - HWDivider
+# CHECK-NEXT: [1]   - HWFPDivider
+# CHECK-NEXT: [2]   - HWPort0
+# CHECK-NEXT: [3]   - HWPort1
+# CHECK-NEXT: [4]   - HWPort2
+# CHECK-NEXT: [5]   - HWPort3
+# CHECK-NEXT: [6]   - HWPort4
+# CHECK-NEXT: [7]   - HWPort5
+# CHECK-NEXT: [8]   - HWPort6
+# CHECK-NEXT: [9]   - HWPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     2.00   6.00   0.66   0.67   1.00   4.00   4.00   0.67
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -     2.00   6.00   0.66   0.67   1.00   4.00   4.00   0.67   cmpxchg16b	(%rsi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeER.    .    .    .    .    .    .    .    .  .   cmpxchg16b	(%rsi)
+# CHECK-NEXT: [1,0]     .    D=================eeeeeeeeeeeeeeeeeeeeeeER   .    .    .    .  .   cmpxchg16b	(%rsi)
+# CHECK-NEXT: [2,0]     .    .    D==================================eeeeeeeeeeeeeeeeeeeeeeER   cmpxchg16b	(%rsi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     18.0   0.3    0.0       cmpxchg16b	(%rsi)
Index: test/tools/llvm-mca/X86/option-all-stats-1.s
===================================================================
--- test/tools/llvm-mca/X86/option-all-stats-1.s
+++ test/tools/llvm-mca/X86/option-all-stats-1.s
@@ -27,12 +27,12 @@
 # FULLREPORT:      Dynamic Dispatch Stall Cycles:
 # FULLREPORT-NEXT: RAT     - Register unavailable:                      0
 # FULLREPORT-NEXT: RCU     - Retire tokens unavailable:                 0
-# FULLREPORT-NEXT: SCHEDQ  - Scheduler full:                            61
+# FULLREPORT-NEXT: SCHEDQ  - Scheduler full:                            61  (59.2%)
 # FULLREPORT-NEXT: LQ      - Load queue full:                           0
 # FULLREPORT-NEXT: SQ      - Store queue full:                          0
 # FULLREPORT-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# FULLREPORT:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# FULLREPORT:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # FULLREPORT-NEXT: [# dispatched], [# cycles]
 # FULLREPORT-NEXT:  0,              22  (21.4%)
 # FULLREPORT-NEXT:  1,              62  (60.2%)
Index: test/tools/llvm-mca/X86/option-all-stats-2.s
===================================================================
--- test/tools/llvm-mca/X86/option-all-stats-2.s
+++ test/tools/llvm-mca/X86/option-all-stats-2.s
@@ -28,12 +28,12 @@
 # FULL:      Dynamic Dispatch Stall Cycles:
 # FULL-NEXT: RAT     - Register unavailable:                      0
 # FULL-NEXT: RCU     - Retire tokens unavailable:                 0
-# FULL-NEXT: SCHEDQ  - Scheduler full:                            61
+# FULL-NEXT: SCHEDQ  - Scheduler full:                            61  (59.2%)
 # FULL-NEXT: LQ      - Load queue full:                           0
 # FULL-NEXT: SQ      - Store queue full:                          0
 # FULL-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
-# FULL:      Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# FULL:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # FULL-NEXT: [# dispatched], [# cycles]
 # FULL-NEXT:  0,              22  (21.4%)
 # FULL-NEXT:  1,              62  (60.2%)
Index: test/tools/llvm-mca/X86/option-all-views-1.s
===================================================================
--- test/tools/llvm-mca/X86/option-all-views-1.s
+++ test/tools/llvm-mca/X86/option-all-views-1.s
@@ -29,12 +29,12 @@
 # FULLREPORT:         Dynamic Dispatch Stall Cycles:
 # FULLREPORT-NEXT:    RAT     - Register unavailable:                      0
 # FULLREPORT-NEXT:    RCU     - Retire tokens unavailable:                 0
-# FULLREPORT-NEXT:    SCHEDQ  - Scheduler full:                            61
+# FULLREPORT-NEXT:    SCHEDQ  - Scheduler full:                            61  (59.2%)
 # FULLREPORT-NEXT:    LQ      - Load queue full:                           0
 # FULLREPORT-NEXT:    SQ      - Store queue full:                          0
 # FULLREPORT-NEXT:    GROUP   - Static restrictions on the dispatch group: 0
 
-# FULLREPORT:         Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# FULLREPORT:         Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # FULLREPORT-NEXT:    [# dispatched], [# cycles]
 # FULLREPORT-NEXT:     0,              22  (21.4%)
 # FULLREPORT-NEXT:     1,              62  (60.2%)
Index: test/tools/llvm-mca/X86/option-all-views-2.s
===================================================================
--- test/tools/llvm-mca/X86/option-all-views-2.s
+++ test/tools/llvm-mca/X86/option-all-views-2.s
@@ -28,12 +28,12 @@
 # ALL:             Dynamic Dispatch Stall Cycles:
 # ALL-NEXT:        RAT     - Register unavailable:                      0
 # ALL-NEXT:        RCU     - Retire tokens unavailable:                 0
-# ALL-NEXT:        SCHEDQ  - Scheduler full:                            61
+# ALL-NEXT:        SCHEDQ  - Scheduler full:                            61  (59.2%)
 # ALL-NEXT:        LQ      - Load queue full:                           0
 # ALL-NEXT:        SQ      - Store queue full:                          0
 # ALL-NEXT:        GROUP   - Static restrictions on the dispatch group: 0
 
-# ALL:             Dispatch Logic - number of cycles where we saw N instructions dispatched:
+# ALL:             Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # ALL-NEXT:        [# dispatched], [# cycles]
 # ALL-NEXT:         0,              22  (21.4%)
 # ALL-NEXT:         1,              62  (60.2%)
Index: tools/llvm-mca/Views/DispatchStatistics.h
===================================================================
--- tools/llvm-mca/Views/DispatchStatistics.h
+++ tools/llvm-mca/Views/DispatchStatistics.h
@@ -24,7 +24,7 @@
 /// GROUP   - Static restrictions on the dispatch group: 0
 ///
 ///
-/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
+/// Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 /// [# dispatched], [# cycles]
 ///  0,              15  (11.5%)
 ///  2,              4  (3.1%)
Index: tools/llvm-mca/Views/DispatchStatistics.cpp
===================================================================
--- tools/llvm-mca/Views/DispatchStatistics.cpp
+++ tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -26,20 +26,23 @@
 }
 
 void DispatchStatistics::onEvent(const HWInstructionEvent &Event) {
-  if (Event.Type == HWInstructionEvent::Dispatched)
-    ++NumDispatched;
+  if (Event.Type != HWInstructionEvent::Dispatched)
+    return;
+
+  const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event);
+  NumDispatched += DE.MicroOpcodes;
 }
 
 void DispatchStatistics::printDispatchHistogram(llvm::raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   TempStream << "\n\nDispatch Logic - "
-             << "number of cycles where we saw N instructions dispatched:\n";
+             << "number of cycles where we saw N micro opcodes dispatched:\n";
   TempStream << "[# dispatched], [# cycles]\n";
   for (const std::pair<unsigned, unsigned> &Entry : DispatchGroupSizePerCycle) {
+    double Percentage = ((double)Entry.second / NumCycles) * 100.0;
     TempStream << " " << Entry.first << ",              " << Entry.second
-               << "  ("
-               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "  (" << format("%.1f", floor((Percentage * 10) + 0.5) / 10)
                << "%)\n";
   }
 
@@ -47,24 +50,36 @@
   OS << Buffer;
 }
 
+static void printStalls(raw_ostream &OS, unsigned NumStalls,
+                        unsigned NumCycles) {
+  if (!NumStalls) {
+    OS << NumStalls;
+    return;
+  }
+
+  double Percentage = ((double)NumStalls / NumCycles) * 100.0;
+  OS << NumStalls << "  ("
+     << format("%.1f", floor((Percentage * 10) + 0.5) / 10) << "%)";
+}
+
 void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const {
   std::string Buffer;
-  raw_string_ostream TempStream(Buffer);
-  TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
-  TempStream << "RAT     - Register unavailable:                      "
-             << HWStalls[HWStallEvent::RegisterFileStall];
-  TempStream << "\nRCU     - Retire tokens unavailable:                 "
-             << HWStalls[HWStallEvent::RetireControlUnitStall];
-  TempStream << "\nSCHEDQ  - Scheduler full:                            "
-             << HWStalls[HWStallEvent::SchedulerQueueFull];
-  TempStream << "\nLQ      - Load queue full:                           "
-             << HWStalls[HWStallEvent::LoadQueueFull];
-  TempStream << "\nSQ      - Store queue full:                          "
-             << HWStalls[HWStallEvent::StoreQueueFull];
-  TempStream << "\nGROUP   - Static restrictions on the dispatch group: "
-             << HWStalls[HWStallEvent::DispatchGroupStall];
-  TempStream << '\n';
-  TempStream.flush();
+  raw_string_ostream SS(Buffer);
+  SS << "\n\nDynamic Dispatch Stall Cycles:\n";
+  SS << "RAT     - Register unavailable:                      ";
+  printStalls(SS, HWStalls[HWStallEvent::RegisterFileStall], NumCycles);
+  SS << "\nRCU     - Retire tokens unavailable:                 ";
+  printStalls(SS, HWStalls[HWStallEvent::RetireControlUnitStall], NumCycles);
+  SS << "\nSCHEDQ  - Scheduler full:                            ";
+  printStalls(SS, HWStalls[HWStallEvent::SchedulerQueueFull], NumCycles);
+  SS << "\nLQ      - Load queue full:                           ";
+  printStalls(SS, HWStalls[HWStallEvent::LoadQueueFull], NumCycles);
+  SS << "\nSQ      - Store queue full:                          ";
+  printStalls(SS, HWStalls[HWStallEvent::StoreQueueFull], NumCycles);
+  SS << "\nGROUP   - Static restrictions on the dispatch group: ";
+  printStalls(SS, HWStalls[HWStallEvent::DispatchGroupStall], NumCycles);
+  SS << '\n';
+  SS.flush();
   OS << Buffer;
 }
 
Index: tools/llvm-mca/Views/SummaryView.cpp
===================================================================
--- tools/llvm-mca/Views/SummaryView.cpp
+++ tools/llvm-mca/Views/SummaryView.cpp
@@ -33,12 +33,10 @@
 }
 
 void SummaryView::onEvent(const HWInstructionEvent &Event) {
-  // We are only interested in the "instruction dispatched" events generated by
-  // the dispatch stage for instructions that are part of iteration #0.
-  if (Event.Type != HWInstructionEvent::Dispatched)
-    return;
-
-  if (Event.IR.getSourceIndex() >= Source.size())
+  // We are only interested in the "instruction retired" events generated by
+  // the retire stage for instructions that are part of iteration #0.
+  if (Event.Type != HWInstructionEvent::Retired ||
+      Event.IR.getSourceIndex() >= Source.size())
     return;
 
   // Update the cumulative number of resource cycles based on the processor
Index: tools/llvm-mca/Views/TimelineView.h
===================================================================
--- tools/llvm-mca/Views/TimelineView.h
+++ tools/llvm-mca/Views/TimelineView.h
@@ -126,7 +126,7 @@
   unsigned LastCycle;
 
   struct TimelineViewEntry {
-    unsigned CycleDispatched;
+    int CycleDispatched;  // A negative value is an "invalid cycle".
     unsigned CycleReady;
     unsigned CycleIssued;
     unsigned CycleExecuted;
Index: tools/llvm-mca/Views/TimelineView.cpp
===================================================================
--- tools/llvm-mca/Views/TimelineView.cpp
+++ tools/llvm-mca/Views/TimelineView.cpp
@@ -29,6 +29,8 @@
     MaxIterations = DEFAULT_ITERATIONS;
   NumInstructions *= std::min(MaxIterations, AsmSequence.getNumIterations());
   Timeline.resize(NumInstructions);
+  TimelineViewEntry InvalidTVEntry = {-1, 0, 0, 0};
+  std::fill(Timeline.begin(), Timeline.end(), InvalidTVEntry);
 
   WaitTimeEntry NullWTEntry = {0, 0, 0};
   std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry);
@@ -68,10 +70,13 @@
       TVEntry.CycleRetired = CurrentCycle;
 
     // Update the WaitTime entry which corresponds to this Index.
+    assert(TVEntry.CycleDispatched >= 0 && "Invalid TVEntry found!");
+    unsigned CycleDispatched = static_cast<unsigned>(TVEntry.CycleDispatched);
     WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
     WTEntry.CyclesSpentInSchedulerQueue +=
-        TVEntry.CycleIssued - TVEntry.CycleDispatched;
-    assert(TVEntry.CycleDispatched <= TVEntry.CycleReady);
+        TVEntry.CycleIssued - CycleDispatched;
+    assert(CycleDispatched <= TVEntry.CycleReady &&
+           "Instruction cannot be ready if it hasn't been dispatched yet!");
     WTEntry.CyclesSpentInSQWhileReady +=
         TVEntry.CycleIssued - TVEntry.CycleReady;
     WTEntry.CyclesSpentAfterWBAndBeforeRetire +=
@@ -88,7 +93,11 @@
     Timeline[Index].CycleExecuted = CurrentCycle;
     break;
   case HWInstructionEvent::Dispatched:
-    Timeline[Index].CycleDispatched = CurrentCycle;
+    // There may be multiple dispatch events. Microcoded instructions that are
+    // expanded into multiple uOps may require multiple dispatch cycles. Here,
+    // we want to capture the first dispatch cycle.
+    if (Timeline[Index].CycleDispatched == -1)
+      Timeline[Index].CycleDispatched = static_cast<int>(CurrentCycle);
     break;
   default:
     return;
@@ -193,19 +202,20 @@
     OS << '\n';
   OS << '[' << Iteration << ',' << SourceIndex << ']';
   OS.PadToColumn(10);
-  for (unsigned I = 0, E = Entry.CycleDispatched; I < E; ++I)
+  assert(Entry.CycleDispatched >= 0 && "Invalid TimelineViewEntry!");
+  unsigned CycleDispatched = static_cast<unsigned>(Entry.CycleDispatched);
+  for (unsigned I = 0, E = CycleDispatched; I < E; ++I)
     OS << ((I % 5 == 0) ? '.' : ' ');
   OS << TimelineView::DisplayChar::Dispatched;
-  if (Entry.CycleDispatched != Entry.CycleExecuted) {
+  if (CycleDispatched != Entry.CycleExecuted) {
     // Zero latency instructions have the same value for CycleDispatched,
     // CycleIssued and CycleExecuted.
-    for (unsigned I = Entry.CycleDispatched + 1, E = Entry.CycleIssued; I < E;
-         ++I)
+    for (unsigned I = CycleDispatched + 1, E = Entry.CycleIssued; I < E; ++I)
       OS << TimelineView::DisplayChar::Waiting;
     if (Entry.CycleIssued == Entry.CycleExecuted)
       OS << TimelineView::DisplayChar::DisplayChar::Executed;
     else {
-      if (Entry.CycleDispatched != Entry.CycleIssued)
+      if (CycleDispatched != Entry.CycleIssued)
         OS << TimelineView::DisplayChar::Executing;
       for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E;
            ++I)
Index: tools/llvm-mca/include/HWEventListener.h
===================================================================
--- tools/llvm-mca/include/HWEventListener.h
+++ tools/llvm-mca/include/HWEventListener.h
@@ -70,12 +70,23 @@
 
 class HWInstructionDispatchedEvent : public HWInstructionEvent {
 public:
-  HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs)
+  HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs,
+                               unsigned UOps)
       : HWInstructionEvent(HWInstructionEvent::Dispatched, IR),
-        UsedPhysRegs(Regs) {}
+        UsedPhysRegs(Regs), MicroOpcodes(UOps) {}
   // Number of physical register allocated for this instruction. There is one
   // entry per register file.
   llvm::ArrayRef<unsigned> UsedPhysRegs;
+  // Number of micro opcodes dispatched.
+  // This field is usually set to the total number of micro opcodes specified by
+  // the instruction descriptor of IR.
+  // The only exception is when IR declares a number of micro opcodes
+  // which exceeds the processor DispatchWidth, and - by construction - it
+  // requires multiple cycles to be fully dispatched. In that particular case,
+  // the dispatch logic would generate more than one dispatch event (one per
+  // cycle), and each event would declare how many micro opcodes have actually
+  // been dispatched to the schedulers.
+  unsigned MicroOpcodes;
 };
 
 class HWInstructionRetiredEvent : public HWInstructionEvent {
Index: tools/llvm-mca/include/Stages/DispatchStage.h
===================================================================
--- tools/llvm-mca/include/Stages/DispatchStage.h
+++ tools/llvm-mca/include/Stages/DispatchStage.h
@@ -51,6 +51,7 @@
   unsigned DispatchWidth;
   unsigned AvailableEntries;
   unsigned CarryOver;
+  InstRef CarriedOver;
   const llvm::MCSubtargetInfo &STI;
   RetireControlUnit &RCU;
   RegisterFile &PRF;
@@ -63,7 +64,8 @@
   void updateRAWDependencies(ReadState &RS, const llvm::MCSubtargetInfo &STI);
 
   void notifyInstructionDispatched(const InstRef &IR,
-                                   llvm::ArrayRef<unsigned> UsedPhysRegs);
+                                   llvm::ArrayRef<unsigned> UsedPhysRegs,
+                                   unsigned uOps);
 
   void collectWrites(llvm::SmallVectorImpl<WriteRef> &Vec,
                      unsigned RegID) const {
@@ -75,7 +77,7 @@
                 const llvm::MCRegisterInfo &MRI, unsigned MaxDispatchWidth,
                 RetireControlUnit &R, RegisterFile &F)
       : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
-        CarryOver(0U), STI(Subtarget), RCU(R), PRF(F) {}
+        CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) {}
 
   bool isAvailable(const InstRef &IR) const override;
 
Index: tools/llvm-mca/lib/Stages/DispatchStage.cpp
===================================================================
--- tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -28,9 +28,11 @@
 namespace mca {
 
 void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
-                                                ArrayRef<unsigned> UsedRegs) {
+                                                ArrayRef<unsigned> UsedRegs,
+                                                unsigned UOps) {
   LLVM_DEBUG(dbgs() << "[E] Instruction Dispatched: #" << IR << '\n');
-  notifyEvent<HWInstructionEvent>(HWInstructionDispatchedEvent(IR, UsedRegs));
+  notifyEvent<HWInstructionEvent>(
+      HWInstructionDispatchedEvent(IR, UsedRegs, UOps));
 }
 
 bool DispatchStage::checkPRF(const InstRef &IR) const {
@@ -92,6 +94,7 @@
     assert(AvailableEntries == DispatchWidth);
     AvailableEntries = 0;
     CarryOver = NumMicroOps - DispatchWidth;
+    CarriedOver = IR;
   } else {
     assert(AvailableEntries >= NumMicroOps);
     AvailableEntries -= NumMicroOps;
@@ -125,13 +128,26 @@
 
   // Notify listeners of the "instruction dispatched" event,
   // and move IR to the next stage.
-  notifyInstructionDispatched(IR, RegisterFiles);
+  notifyInstructionDispatched(IR, RegisterFiles,
+                              std::min(DispatchWidth, NumMicroOps));
   return moveToTheNextStage(IR);
 }
 
 llvm::Error DispatchStage::cycleStart() {
+  if (!CarryOver) {
+    AvailableEntries = DispatchWidth;
+    return llvm::ErrorSuccess();
+  }
+
   AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
-  CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+  unsigned DispatchedOpcodes = DispatchWidth - AvailableEntries;
+  CarryOver -= DispatchedOpcodes;
+  assert(CarriedOver.isValid() && "Invalid dispatched instruction");
+  
+  SmallVector<unsigned, 8> RegisterFiles(PRF.getNumRegisterFiles(), 0U);
+  notifyInstructionDispatched(CarriedOver, RegisterFiles, DispatchedOpcodes);
+  if (!CarryOver)
+    CarriedOver = InstRef();
   return llvm::ErrorSuccess();
 }