diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst
--- a/llvm/docs/CommandGuide/llvm-mca.rst
+++ b/llvm/docs/CommandGuide/llvm-mca.rst
@@ -16,8 +16,8 @@
 of machine code in a specific CPU.
 
 Performance is measured in terms of throughput as well as processor resource
-consumption. The tool currently works for processors with an out-of-order
-backend, for which there is a scheduling model available in LLVM.
+consumption. The tool currently works for processors with a backend for which
+there is a scheduling model available in LLVM.
 
 The main goal of this tool is not just to predict the performance of the code
 when run on the target, but also help with diagnosing potential performance
@@ -204,7 +204,8 @@
 
   Print information about bottlenecks that affect the throughput. This analysis
   can be expensive, and it is disabled by default.  Bottlenecks are highlighted
-  in the summary view.
+  in the summary view. Bottleneck analysis is currently not supported for
+  processors with an in-order backend.
 
 .. option:: -json
 
@@ -388,7 +389,9 @@
 Throughput).
 
 Field *DispatchWidth* is the maximum number of micro opcodes that are dispatched
-to the out-of-order backend every simulated cycle.
+to the out-of-order backend every simulated cycle. For processors with an
+in-order backend, *DispatchWidth* is the maximum number of micro opcodes issued
+to the backend every simulated cycle.
 
 IPC is computed dividing the total number of simulated instructions by the total
 number of cycles.
@@ -653,6 +656,8 @@
 dependent on the simulation and (as always) by the quality of the processor
 model in llvm.
 
+Bottleneck analysis is currently not supported for processors with an in-order
+backend.
 
 Extra Statistics to Further Diagnose Performance Issues
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -797,11 +802,15 @@
 * Write Back (Instruction is executed, and results are written back).
 * Retire (Instruction is retired; writes are architecturally committed).
 
-The default pipeline only models the out-of-order portion of a processor.
-Therefore, the instruction fetch and decode stages are not modeled. Performance
-bottlenecks in the frontend are not diagnosed. :program:`llvm-mca` assumes that
-instructions have all been decoded and placed into a queue before the simulation
-start.  Also, :program:`llvm-mca` does not model branch prediction.
+The in-order pipeline implements the following sequence of stages:
+
+* InOrderIssue (Instruction is issued to the processor pipelines).
+* Retire (Instruction is retired; writes are architecturally committed).
+
+:program:`llvm-mca` assumes that instructions have all been decoded and placed
+into a queue before the simulation starts. Therefore, the instruction fetch and
+decode stages are not modeled. Performance bottlenecks in the frontend are not
+diagnosed. Also, :program:`llvm-mca` does not model branch prediction.
 
 Instruction Dispatch
 """"""""""""""""""""
@@ -957,3 +965,17 @@
 #. A load may pass a previous load.
 #. A load may not pass a previous store unless ``-noalias`` is set.
 #. A load has to wait until an older load barrier is fully executed.
+
+In-order Issue and Execute
+""""""""""""""""""""""""""""""""""""
+In-order processors are modelled as a single ``InOrderIssueStage`` stage. It
+bypasses Dispatch, Scheduler and Load/Store unit. Instructions are issued as
+soon as their operand registers are available and resource requirements are
+met. Multiple instructions can be issued in one cycle according to the value of
+the ``IssueWidth`` parameter in LLVM's scheduling model.
+
+Once issued, an instruction is moved to the ``IssuedInst`` set until it is ready
+to retire. If ``RetireControlUnit`` is defined in LLVM's scheduling model,
+:program:`llvm-mca` ensures that instructions are retired in-order. However, an
+instruction is allowed to retire out-of-order if the ``RetireOOO`` property is
+true for at least one of its writes.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -130,6 +130,9 @@
 * The options ``--build-id-link-{dir,input,output}`` have been deleted.
   (`D96310 <https://reviews.llvm.org/D96310>`_)
 
+* Support for in-order processors has been added to ``llvm-mca``.
+  (`D94928 <https://reviews.llvm.org/D94928>`_)
+
 Changes to LLDB
 ---------------------------------
 
diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h
--- a/llvm/include/llvm/MC/MCSchedule.h
+++ b/llvm/include/llvm/MC/MCSchedule.h
@@ -108,15 +108,16 @@
 ///
 /// Defined as an aggregate struct for creating tables with initializer lists.
 struct MCSchedClassDesc {
-  static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
+  static const unsigned short InvalidNumMicroOps = (1U << 13) - 1;
   static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   const char* Name;
 #endif
-  uint16_t NumMicroOps : 14;
+  uint16_t NumMicroOps : 13;
   uint16_t BeginGroup : 1;
   uint16_t EndGroup : 1;
+  uint16_t RetireOOO : 1;
   uint16_t WriteProcResIdx; // First index into WriteProcResTable.
   uint16_t NumWriteProcResEntries;
   uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
diff --git a/llvm/include/llvm/MCA/Context.h b/llvm/include/llvm/MCA/Context.h
--- a/llvm/include/llvm/MCA/Context.h
+++ b/llvm/include/llvm/MCA/Context.h
@@ -68,6 +68,11 @@
   /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages.
   std::unique_ptr<Pipeline> createDefaultPipeline(const PipelineOptions &Opts,
                                                   SourceMgr &SrcMgr);
+
+  /// Construct a basic pipeline for simulating an in-order pipeline.
+  /// This pipeline consists of Fetch, InOrderIssue, and Retire stages.
+  std::unique_ptr<Pipeline> createInOrderPipeline(const PipelineOptions &Opts,
+                                                  SourceMgr &SrcMgr);
 };
 
 } // namespace mca
diff --git a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
--- a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
@@ -172,11 +172,6 @@
   void freePhysRegs(const RegisterRenamingInfo &Entry,
                     MutableArrayRef<unsigned> FreedPhysRegs);
 
-  // Collects writes that are in a RAW dependency with RS.
-  // This method is called from `addRegisterRead()`.
-  void collectWrites(const ReadState &RS,
-                     SmallVectorImpl<WriteRef> &Writes) const;
-
   // Create an instance of RegisterMappingTracker for every register file
   // specified by the processor model.
   // If no register file is specified, then this method creates a default
@@ -187,6 +182,10 @@
   RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
                unsigned NumRegs = 0);
 
+  // Collects writes that are in a RAW dependency with RS.
+  void collectWrites(const ReadState &RS,
+                     SmallVectorImpl<WriteRef> &Writes) const;
+
   // This method updates the register mappings inserting a new register
   // definition. This method is also responsible for updating the number of
   // allocated physical registers in each register file modified by the write.
diff --git a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
--- a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h
@@ -104,6 +104,9 @@
 #ifndef NDEBUG
   void dump() const;
 #endif
+
+  // Assigned to instructions that are not handled by the RCU.
+  static const unsigned UnhandledTokenID = ~0U;
 };
 
 } // namespace mca
diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -375,6 +375,7 @@
   bool HasSideEffects;
   bool BeginGroup;
   bool EndGroup;
+  bool RetireOOO;
 
   // True if all buffered resources are in-order, and there is at least one
   // buffer which is a dispatch hazard (BufferSize = 0).
diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -0,0 +1,84 @@
+//===---------------------- InOrderIssueStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// InOrderIssueStage implements an in-order execution pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
+#define LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/SourceMgr.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+#include <queue>
+
+namespace llvm {
+struct MCSchedModel;
+class MCSubtargetInfo;
+
+namespace mca {
+class RegisterFile;
+class ResourceManager;
+struct RetireControlUnit;
+
+class InOrderIssueStage final : public Stage {
+  const MCSchedModel &SM;
+  const MCSubtargetInfo &STI;
+  RetireControlUnit &RCU;
+  RegisterFile &PRF;
+  std::unique_ptr<ResourceManager> RM;
+
+  /// Instructions that were issued, but not executed yet.
+  SmallVector<InstRef, 4> IssuedInst;
+
+  /// Number of instructions issued in the current cycle.
+  unsigned NumIssued;
+
+  /// If an instruction cannot execute due to an unmet register or resource
+  /// dependency, then it is stalled for StallCyclesLeft cycles.
+  InstRef StalledInst;
+  unsigned StallCyclesLeft;
+
+  /// Number of micro opcodes that can still be issued in the current cycle.
+  unsigned Bandwidth;
+
+  InOrderIssueStage(const InOrderIssueStage &Other) = delete;
+  InOrderIssueStage &operator=(const InOrderIssueStage &Other) = delete;
+
+  /// If IR has an unmet register or resource dependency, canExecute returns
+  /// false. StallCycles is set to the number of cycles left before the
+  /// instruction can be issued.
+  bool canExecute(const InstRef &IR, unsigned *StallCycles) const;
+
+  /// Issue the instruction, or update StallCycles if IR is stalled.
+  Error tryIssue(InstRef &IR, unsigned *StallCycles);
+
+  /// Update status of instructions from IssuedInst.
+  Error updateIssuedInst();
+
+public:
+  InOrderIssueStage(RetireControlUnit &RCU, RegisterFile &PRF,
+                    const MCSchedModel &SM, const MCSubtargetInfo &STI)
+      : SM(SM), STI(STI), RCU(RCU), PRF(PRF),
+        RM(std::make_unique<ResourceManager>(SM)), NumIssued(0),
+        StallCyclesLeft(0), Bandwidth(0) {}
+
+  bool isAvailable(const InstRef &) const override;
+  bool hasWorkToComplete() const override;
+  Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
diff --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h
--- a/llvm/include/llvm/MCA/Stages/RetireStage.h
+++ b/llvm/include/llvm/MCA/Stages/RetireStage.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_MCA_STAGES_RETIRESTAGE_H
 #define LLVM_MCA_STAGES_RETIRESTAGE_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MCA/HardwareUnits/LSUnit.h"
 #include "llvm/MCA/HardwareUnits/RegisterFile.h"
 #include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
@@ -29,6 +30,7 @@
   RetireControlUnit &RCU;
   RegisterFile &PRF;
   LSUnitBase &LSU;
+  SmallVector<InstRef, 4> RetireInst;
 
   RetireStage(const RetireStage &Other) = delete;
   RetireStage &operator=(const RetireStage &Other) = delete;
@@ -37,7 +39,9 @@
   RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
       : Stage(), RCU(R), PRF(F), LSU(LS) {}
 
-  bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
+  bool hasWorkToComplete() const override {
+    return !RCU.isEmpty() || !RetireInst.empty();
+  }
   Error cycleStart() override;
   Error execute(InstRef &IR) override;
   void notifyInstructionRetired(const InstRef &IR) const;
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -262,6 +262,10 @@
   // Allow a processor to mark some scheduling classes as single-issue.
   // SingleIssue is an alias for Begin/End Group.
   bit SingleIssue = false;
+  // An instruction is allowed to retire out-of-order if RetireOOO is
+  // true for at least one of its writes. This field is only used by
+  // MCA for in-order subtargets, and is ignored for other targets.
+  bit RetireOOO = false;
   SchedMachineModel SchedModel = ?;
 }
 
diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt
--- a/llvm/lib/MCA/CMakeLists.txt
+++ b/llvm/lib/MCA/CMakeLists.txt
@@ -14,6 +14,7 @@
   Stages/DispatchStage.cpp
   Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
+  Stages/InOrderIssueStage.cpp
   Stages/InstructionTables.cpp
   Stages/MicroOpQueueStage.cpp
   Stages/RetireStage.cpp
diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/InOrderIssueStage.h"
 #include "llvm/MCA/Stages/MicroOpQueueStage.h"
 #include "llvm/MCA/Stages/RetireStage.h"
 
@@ -31,6 +32,9 @@
 Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
   const MCSchedModel &SM = STI.getSchedModel();
 
+  if (!SM.isOutOfOrder())
+    return createInOrderPipeline(Opts, SrcMgr);
+
   // Create the hardware units defining the backend.
   auto RCU = std::make_unique<RetireControlUnit>(SM);
   auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
@@ -64,5 +68,29 @@
   return StagePipeline;
 }
 
+std::unique_ptr<Pipeline>
+Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
+  const MCSchedModel &SM = STI.getSchedModel();
+  auto RCU = std::make_unique<RetireControlUnit>(SM);
+  auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
+  auto LSU = std::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+                                      Opts.StoreQueueSize, Opts.AssumeNoAlias);
+
+  auto Entry = std::make_unique<EntryStage>(SrcMgr);
+  auto InOrderIssue = std::make_unique<InOrderIssueStage>(*RCU, *PRF, SM, STI);
+  auto Retire = std::make_unique<RetireStage>(*RCU, *PRF, *LSU);
+
+  auto StagePipeline = std::make_unique<Pipeline>();
+  StagePipeline->appendStage(std::move(Entry));
+  StagePipeline->appendStage(std::move(InOrderIssue));
+  StagePipeline->appendStage(std::move(Retire));
+
+  addHardwareUnit(std::move(RCU));
+  addHardwareUnit(std::move(PRF));
+  addHardwareUnit(std::move(LSU));
+
+  return StagePipeline;
+}
+
 } // namespace mca
 } // namespace llvm
diff --git a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
--- a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
+++ b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp
@@ -33,12 +33,18 @@
     MaxRetirePerCycle = EPI.MaxRetirePerCycle;
   }
   NumROBEntries = AvailableEntries;
+  bool IsOutOfOrder = SM.MicroOpBufferSize;
+  if (!IsOutOfOrder && !NumROBEntries)
+    return;
   assert(NumROBEntries && "Invalid reorder buffer size!");
   Queue.resize(2 * NumROBEntries);
 }
 
 // Reserves a number of slots, and returns a new token.
 unsigned RetireControlUnit::dispatch(const InstRef &IR) {
+  if (!NumROBEntries)
+    return UnhandledTokenID;
+
   const Instruction &Inst = *IR.getInstruction();
   unsigned Entries = normalizeQuantity(Inst.getNumMicroOps());
   assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!");
@@ -47,6 +53,7 @@
   Queue[NextAvailableSlotIdx] = {IR, Entries, false};
   NextAvailableSlotIdx += std::max(1U, Entries);
   NextAvailableSlotIdx %= Queue.size();
+  assert(TokenID < UnhandledTokenID && "Invalid token ID");
 
   AvailableEntries -= Entries;
   return TokenID;
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -570,6 +570,7 @@
   ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
   ID->BeginGroup = SCDesc.BeginGroup;
   ID->EndGroup = SCDesc.EndGroup;
+  ID->RetireOOO = SCDesc.RetireOOO;
 
   initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
   computeMaxLatency(*ID, MCDesc, SCDesc, STI);
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -0,0 +1,292 @@
+//===---------------------- InOrderIssueStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// InOrderIssueStage implements an in-order execution pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/InOrderIssueStage.h"
+
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HWEventListener.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/HardwareUnits/ResourceManager.h"
+#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "llvm-mca"
+namespace llvm {
+namespace mca {
+
+bool InOrderIssueStage::hasWorkToComplete() const {
+  return !IssuedInst.empty() || StalledInst;
+}
+
+/// Return true if IR can be accepted by this stage in the current cycle: no
+/// older instruction is stalled and there is enough issue bandwidth left.
+bool InOrderIssueStage::isAvailable(const InstRef &IR) const {
+  const Instruction &Inst = *IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
+
+  // A stalled instruction blocks every younger instruction. Bandwidth is zero
+  // while stalled, so zero-uop instructions need this explicit check.
+  if (StalledInst || Bandwidth < Inst.getNumMicroOps())
+    return false;
+
+  // Instruction with BeginGroup must be the first instruction to be issued in a
+  // cycle.
+  return !Desc.BeginGroup || NumIssued == 0;
+}
+
+static bool hasResourceHazard(const ResourceManager &RM, const InstRef &IR) {
+  if (RM.checkAvailability(IR.getInstruction()->getDesc())) {
+    LLVM_DEBUG(dbgs() << "[E] Stall #" << IR << '\n');
+    return true;
+  }
+
+  return false;
+}
+
+/// Return the number of cycles left until the register requirements of the
+/// instruction are met.
+static unsigned checkRegisterHazard(const RegisterFile &PRF,
+                                    const MCSchedModel &SM,
+                                    const MCSubtargetInfo &STI,
+                                    const InstRef &IR) {
+  unsigned StallCycles = 0;
+  SmallVector<WriteRef, 4> Writes;
+
+  for (const ReadState &RS : IR.getInstruction()->getUses()) {
+    const ReadDescriptor &RD = RS.getDescriptor();
+    const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+
+    PRF.collectWrites(RS, Writes);
+    for (const WriteRef &WR : Writes) {
+      const WriteState *WS = WR.getWriteState();
+      unsigned WriteResID = WS->getWriteResourceID();
+      int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
+      LLVM_DEBUG(dbgs() << "[E] ReadAdvance for #" << IR << ": " << ReadAdvance
+                        << '\n');
+
+      if (WS->getCyclesLeft() == UNKNOWN_CYCLES) {
+        // Latency of the write is not known yet: stall one cycle and re-check.
+        StallCycles = std::max(StallCycles, 1U);
+        continue;
+      }
+
+      int CyclesLeft = WS->getCyclesLeft() - ReadAdvance;
+      if (CyclesLeft > 0) {
+        LLVM_DEBUG(dbgs() << "[E] Register hazard: " << WS->getRegisterID()
+                          << '\n');
+        StallCycles = std::max(StallCycles, static_cast<unsigned>(CyclesLeft));
+      }
+    }
+    Writes.clear();
+  }
+
+  return StallCycles;
+}
+
+bool InOrderIssueStage::canExecute(const InstRef &IR,
+                                   unsigned *StallCycles) const {
+  *StallCycles = 0;
+
+  if (unsigned RegStall = checkRegisterHazard(PRF, SM, STI, IR)) {
+    *StallCycles = RegStall;
+    // FIXME: add a parameter to HWStallEvent to indicate a number of cycles.
+    for (unsigned I = 0; I < RegStall; ++I) {
+      notifyEvent<HWStallEvent>(
+          HWStallEvent(HWStallEvent::RegisterFileStall, IR));
+      notifyEvent<HWPressureEvent>(
+          HWPressureEvent(HWPressureEvent::REGISTER_DEPS, IR));
+    }
+  } else if (hasResourceHazard(*RM, IR)) {
+    *StallCycles = 1;
+    notifyEvent<HWStallEvent>(
+        HWStallEvent(HWStallEvent::DispatchGroupStall, IR));
+    notifyEvent<HWPressureEvent>(
+        HWPressureEvent(HWPressureEvent::RESOURCES, IR));
+  }
+
+  return *StallCycles == 0;
+}
+
+static void addRegisterReadWrite(RegisterFile &PRF, Instruction &IS,
+                                 unsigned SourceIndex,
+                                 const MCSubtargetInfo &STI,
+                                 SmallVectorImpl<unsigned> &UsedRegs) {
+  assert(!IS.isEliminated());
+
+  for (ReadState &RS : IS.getUses())
+    PRF.addRegisterRead(RS, STI);
+
+  for (WriteState &WS : IS.getDefs())
+    PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs);
+}
+
+static void notifyInstructionExecute(
+    const InstRef &IR,
+    const SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &UsedRes,
+    const Stage &S) {
+
+  S.notifyEvent<HWInstructionEvent>(
+      HWInstructionEvent(HWInstructionEvent::Ready, IR));
+  S.notifyEvent<HWInstructionEvent>(HWInstructionIssuedEvent(IR, UsedRes));
+
+  LLVM_DEBUG(dbgs() << "[E] Issued #" << IR << "\n");
+}
+
+static void notifyInstructionDispatch(const InstRef &IR, unsigned Ops,
+                                      const SmallVectorImpl<unsigned> &UsedRegs,
+                                      const Stage &S) {
+
+  S.notifyEvent<HWInstructionEvent>(
+      HWInstructionDispatchedEvent(IR, UsedRegs, Ops));
+
+  LLVM_DEBUG(dbgs() << "[E] Dispatched #" << IR << "\n");
+}
+
+llvm::Error InOrderIssueStage::execute(InstRef &IR) {
+  Instruction &IS = *IR.getInstruction();
+  const InstrDesc &Desc = IS.getDesc();
+
+  unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID;
+  if (!Desc.RetireOOO)
+    RCUTokenID = RCU.dispatch(IR);
+  IS.dispatch(RCUTokenID);
+
+  if (Desc.EndGroup) {
+    Bandwidth = 0;
+  } else {
+    unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps();
+    assert(Bandwidth >= NumMicroOps);
+    Bandwidth -= NumMicroOps;
+  }
+
+  if (llvm::Error E = tryIssue(IR, &StallCyclesLeft))
+    return E;
+
+  if (StallCyclesLeft) {
+    StalledInst = IR;
+    Bandwidth = 0;
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
+  Instruction &IS = *IR.getInstruction();
+  unsigned SourceIndex = IR.getSourceIndex();
+
+  if (!canExecute(IR, StallCycles)) {
+    LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles
+                      << " cycles\n");
+    return llvm::ErrorSuccess();
+  }
+
+  SmallVector<unsigned, 4> UsedRegs(PRF.getNumRegisterFiles());
+  addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs);
+
+  notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this);
+
+  SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  RM->issueInstruction(IS.getDesc(), UsedResources);
+  IS.execute(SourceIndex);
+
+  // Replace resource masks with valid resource processor IDs.
+  for (std::pair<ResourceRef, ResourceCycles> &Use : UsedResources) {
+    uint64_t Mask = Use.first.first;
+    Use.first.first = RM->resolveResourceMask(Mask);
+  }
+  notifyInstructionExecute(IR, UsedResources, *this);
+
+  IssuedInst.push_back(IR);
+  ++NumIssued;
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::updateIssuedInst() {
+  // Advance every in-flight instruction by one cycle. Instructions that have
+  // finished executing are swapped to the tail of IssuedInst.
+  unsigned NumExecuted = 0;
+  for (auto I = IssuedInst.begin(), E = IssuedInst.end();
+       I != (E - NumExecuted);) {
+    InstRef &IR = *I;
+    Instruction &IS = *IR.getInstruction();
+
+    IS.cycleEvent();
+    if (!IS.isExecuted()) {
+      LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR
+                        << " is still executing\n");
+      ++I;
+      continue;
+    }
+    notifyEvent<HWInstructionEvent>(
+        HWInstructionEvent(HWInstructionEvent::Executed, IR));
+    LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n");
+
+    // Do not advance I: the element swapped in still has to be visited.
+    ++NumExecuted;
+    std::iter_swap(I, E - NumExecuted);
+  }
+
+  // Hand the executed instructions over to the next stage and drop them.
+  if (NumExecuted) {
+    for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E;
+         ++I) {
+      if (llvm::Error Err = moveToTheNextStage(*I))
+        return Err;
+    }
+    IssuedInst.resize(IssuedInst.size() - NumExecuted);
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::cycleStart() {
+  NumIssued = 0;
+
+  // Release consumed resources.
+  SmallVector<ResourceRef, 4> Freed;
+  RM->cycleEvent(Freed);
+
+  if (llvm::Error E = updateIssuedInst())
+    return E;
+
+  // Issue instructions scheduled for this cycle
+  if (!StallCyclesLeft && StalledInst) {
+    if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft))
+      return E;
+  }
+
+  if (!StallCyclesLeft) {
+    StalledInst.invalidate();
+    assert(NumIssued <= SM.IssueWidth && "Overflow.");
+    Bandwidth = SM.IssueWidth - NumIssued;
+  } else {
+    // The instruction is still stalled, cannot issue any new instructions in
+    // this cycle.
+    Bandwidth = 0;
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::cycleEnd() {
+  if (StallCyclesLeft > 0)
+    --StallCyclesLeft;
+  return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp
--- a/llvm/lib/MCA/Stages/RetireStage.cpp
+++ b/llvm/lib/MCA/Stages/RetireStage.cpp
@@ -23,9 +23,6 @@
 namespace mca {
 
 llvm::Error RetireStage::cycleStart() {
-  if (RCU.isEmpty())
-    return llvm::ErrorSuccess();
-
   const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle();
   unsigned NumRetired = 0;
   while (!RCU.isEmpty()) {
@@ -39,11 +36,26 @@
     NumRetired++;
   }
 
+  // Retire instructions that are not controlled by the RCU
+  for (InstRef &IR : RetireInst) {
+    IR.getInstruction()->retire();
+    notifyInstructionRetired(IR);
+  }
+  RetireInst.resize(0);
+
   return llvm::ErrorSuccess();
 }
 
 llvm::Error RetireStage::execute(InstRef &IR) {
-  RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID());
+  Instruction &IS = *IR.getInstruction();
+
+  unsigned TokenID = IS.getRCUTokenID();
+  if (TokenID != RetireControlUnit::UnhandledTokenID) {
+    RCU.onInstructionExecuted(TokenID);
+    return llvm::ErrorSuccess();
+  }
+
+  RetireInst.push_back(IR);
   return llvm::ErrorSuccess();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -151,6 +151,8 @@
 
 // FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
 def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; }
+
+let RetireOOO = 1 in {
 def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22;
                                             let ResourceCycles = [29]; }
 def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }
@@ -166,7 +168,7 @@
                                                       let ResourceCycles = [9]; }
 def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
                                                       let ResourceCycles = [19]; }
-
+}
 //===----------------------------------------------------------------------===//
 // Subtarget-specific SchedRead types.
 
@@ -336,4 +338,6 @@
 def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
 def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
 def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+def A55RCU : RetireControlUnit<64, 0>;
 }
diff --git a/llvm/test/TableGen/InvalidMCSchedClassDesc.td b/llvm/test/TableGen/InvalidMCSchedClassDesc.td
--- a/llvm/test/TableGen/InvalidMCSchedClassDesc.td
+++ b/llvm/test/TableGen/InvalidMCSchedClassDesc.td
@@ -19,7 +19,7 @@
 // Inst_B didn't have the resoures, and it is invalid.
 // CHECK: SchedModel_ASchedClasses[] = {
 // CHECK: {DBGFIELD("Inst_A")             1
-// CHECK-NEXT: {DBGFIELD("Inst_B")             16383 
+// CHECK-NEXT: {DBGFIELD("Inst_B")             8191
 let SchedModel = SchedModel_A in {
   def Write_A : SchedWriteRes<[]>;
   def : InstRW<[Write_A], (instrs Inst_A)>;
@@ -27,7 +27,7 @@
 
 // Inst_A didn't have the resoures, and it is invalid.
 // CHECK: SchedModel_BSchedClasses[] = {
-// CHECK: {DBGFIELD("Inst_A")             16383 
+// CHECK: {DBGFIELD("Inst_A")             8191
 // CHECK-NEXT: {DBGFIELD("Inst_B")             1 
 let SchedModel = SchedModel_B in {
   def Write_B: SchedWriteRes<[]>; 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --timeline --iterations=2 < %s | FileCheck %s
+
+add      w2, w3, #1
+add      w4, w3, #2, lsl #12
+add      w0, w4, #3
+add      w1, w0, #4
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      8
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        8
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.80
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     0.50                        add	w2, w3, #1
+# CHECK-NEXT:  1      3     0.50                        add	w4, w3, #2, lsl #12
+# CHECK-NEXT:  1      3     0.50                        add	w0, w4, #3
+# CHECK-NEXT:  1      3     0.50                        add	w1, w0, #4
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - CortexA55UnitALU
+# CHECK-NEXT: [0.1] - CortexA55UnitALU
+# CHECK-NEXT: [1]   - CortexA55UnitB
+# CHECK-NEXT: [2]   - CortexA55UnitDiv
+# CHECK-NEXT: [3.0] - CortexA55UnitFPALU
+# CHECK-NEXT: [3.1] - CortexA55UnitFPALU
+# CHECK-NEXT: [4]   - CortexA55UnitFPDIV
+# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC
+# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC
+# CHECK-NEXT: [6]   - CortexA55UnitLd
+# CHECK-NEXT: [7]   - CortexA55UnitMAC
+# CHECK-NEXT: [8]   - CortexA55UnitSt
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -     add	w2, w3, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -     add	w4, w3, #2, lsl #12
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -     add	w0, w4, #3
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -     add	w1, w0, #4
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.   .   add	w2, w3, #1
+# CHECK-NEXT: [0,1]     DeeER.   .   add	w4, w3, #2, lsl #12
+# CHECK-NEXT: [0,2]     .DeeER   .   add	w0, w4, #3
+# CHECK-NEXT: [0,3]     . DeeER  .   add	w1, w0, #4
+# CHECK-NEXT: [1,0]     . DeeER  .   add	w2, w3, #1
+# CHECK-NEXT: [1,1]     .  DeeER .   add	w4, w3, #2, lsl #12
+# CHECK-NEXT: [1,2]     .   DeeER.   add	w0, w4, #3
+# CHECK-NEXT: [1,3]     .    DeeER   add	w1, w0, #4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     0.0    0.0    0.0       add	w2, w3, #1
+# CHECK-NEXT: 1.     2     0.0    0.0    0.0       add	w4, w3, #2, lsl #12
+# CHECK-NEXT: 2.     2     0.0    0.0    0.0       add	w0, w4, #3
+# CHECK-NEXT: 3.     2     0.0    0.0    0.0       add	w1, w0, #4
+# CHECK-NEXT:        2     0.0    0.0    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s
@@ -0,0 +1,100 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --iterations=2 < %s | FileCheck %s
+
+ldr	w4, [x2], #4
+ldr	w5, [x3]
+madd	w0, w5, w4, w0
+add	x3, x3, x13
+subs	x1, x1, #1
+str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total uOps:        14
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      3     1.00    *                   ldr	w4, [x2], #4
+# CHECK-NEXT:  1      3     1.00    *                   ldr	w5, [x3]
+# CHECK-NEXT:  1      4     1.00                        madd	w0, w5, w4, w0
+# CHECK-NEXT:  1      3     0.50                        add	x3, x3, x13
+# CHECK-NEXT:  1      3     0.50                        subs	x1, x1, #1
+# CHECK-NEXT:  1      4     1.00           *            str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      10  (47.6%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              11  (52.4%)
+# CHECK-NEXT:  1,              6  (28.6%)
+# CHECK-NEXT:  2,              4  (19.0%)
+
+# CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          11  (52.4%)
+# CHECK-NEXT:  1,          6  (28.6%)
+# CHECK-NEXT:  2,          4  (19.0%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: No scheduler resources used.
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           14  (66.7%)
+# CHECK-NEXT:  1,           4  (19.0%)
+# CHECK-NEXT:  2,           1  (4.8%)
+# CHECK-NEXT:  3,           2  (9.5%)
+
+# CHECK:      Total ROB Entries:                64
+# CHECK-NEXT: Max Used ROB Entries:             6  ( 9.4% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    14
+# CHECK-NEXT: Max number of mappings used:         6
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - CortexA55UnitALU
+# CHECK-NEXT: [0.1] - CortexA55UnitALU
+# CHECK-NEXT: [1]   - CortexA55UnitB
+# CHECK-NEXT: [2]   - CortexA55UnitDiv
+# CHECK-NEXT: [3.0] - CortexA55UnitFPALU
+# CHECK-NEXT: [3.1] - CortexA55UnitFPALU
+# CHECK-NEXT: [4]   - CortexA55UnitFPDIV
+# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC
+# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC
+# CHECK-NEXT: [6]   - CortexA55UnitLd
+# CHECK-NEXT: [7]   - CortexA55UnitMAC
+# CHECK-NEXT: [8]   - CortexA55UnitSt
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -     2.00   1.00   1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     ldr	w4, [x2], #4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     ldr	w5, [x3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -     madd	w0, w5, w4, w0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -     add	x3, x3, x13
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -     subs	x1, x1, #1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00   str	w0, [x21, x18, lsl #2]
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views --iterations=2 < %s | FileCheck %s
+
+ldr	w4, [x2], #4
+ldr	w5, [x3]
+madd	w0, w5, w4, w0
+add	x3, x3, x13
+subs	x1, x1, #1
+str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      21
+# CHECK-NEXT: Total uOps:        14
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      3     1.00    *                   ldr	w4, [x2], #4
+# CHECK-NEXT:  1      3     1.00    *                   ldr	w5, [x3]
+# CHECK-NEXT:  1      4     1.00                        madd	w0, w5, w4, w0
+# CHECK-NEXT:  1      3     0.50                        add	x3, x3, x13
+# CHECK-NEXT:  1      3     0.50                        subs	x1, x1, #1
+# CHECK-NEXT:  1      4     1.00           *            str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      10  (47.6%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              11  (52.4%)
+# CHECK-NEXT:  1,              6  (28.6%)
+# CHECK-NEXT:  2,              4  (19.0%)
+
+# CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          11  (52.4%)
+# CHECK-NEXT:  1,          6  (28.6%)
+# CHECK-NEXT:  2,          4  (19.0%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: No scheduler resources used.
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           14  (66.7%)
+# CHECK-NEXT:  1,           4  (19.0%)
+# CHECK-NEXT:  2,           1  (4.8%)
+# CHECK-NEXT:  3,           2  (9.5%)
+
+# CHECK:      Total ROB Entries:                64
+# CHECK-NEXT: Max Used ROB Entries:             6  ( 9.4% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    14
+# CHECK-NEXT: Max number of mappings used:         6
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - CortexA55UnitALU
+# CHECK-NEXT: [0.1] - CortexA55UnitALU
+# CHECK-NEXT: [1]   - CortexA55UnitB
+# CHECK-NEXT: [2]   - CortexA55UnitDiv
+# CHECK-NEXT: [3.0] - CortexA55UnitFPALU
+# CHECK-NEXT: [3.1] - CortexA55UnitFPALU
+# CHECK-NEXT: [4]   - CortexA55UnitFPDIV
+# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC
+# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC
+# CHECK-NEXT: [6]   - CortexA55UnitLd
+# CHECK-NEXT: [7]   - CortexA55UnitMAC
+# CHECK-NEXT: [8]   - CortexA55UnitSt
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -     2.00   1.00   1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     ldr	w4, [x2], #4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     ldr	w5, [x3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -     madd	w0, w5, w4, w0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -     add	x3, x3, x13
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -     subs	x1, x1, #1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00   str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeER.    .    .    .   ldr	w4, [x2], #4
+# CHECK-NEXT: [0,1]     .DeeER    .    .    .   ldr	w5, [x3]
+# CHECK-NEXT: [0,2]     .   DeeeER.    .    .   madd	w0, w5, w4, w0
+# CHECK-NEXT: [0,3]     .   DeeE-R.    .    .   add	x3, x3, x13
+# CHECK-NEXT: [0,4]     .    DeeER.    .    .   subs	x1, x1, #1
+# CHECK-NEXT: [0,5]     .    . DeeeER  .    .   str	w0, [x21, x18, lsl #2]
+# CHECK-NEXT: [1,0]     .    .  DeeER  .    .   ldr	w4, [x2], #4
+# CHECK-NEXT: [1,1]     .    .   DeeER .    .   ldr	w5, [x3]
+# CHECK-NEXT: [1,2]     .    .    . DeeeER  .   madd	w0, w5, w4, w0
+# CHECK-NEXT: [1,3]     .    .    . DeeE-R  .   add	x3, x3, x13
+# CHECK-NEXT: [1,4]     .    .    .  DeeER  .   subs	x1, x1, #1
+# CHECK-NEXT: [1,5]     .    .    .    DeeeER   str	w0, [x21, x18, lsl #2]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     0.0    0.0    0.0       ldr	w4, [x2], #4
+# CHECK-NEXT: 1.     2     0.0    0.0    0.0       ldr	w5, [x3]
+# CHECK-NEXT: 2.     2     0.0    0.0    0.0       madd	w0, w5, w4, w0
+# CHECK-NEXT: 3.     2     0.0    0.0    1.0       add	x3, x3, x13
+# CHECK-NEXT: 4.     2     0.0    0.0    0.0       subs	x1, x1, #1
+# CHECK-NEXT: 5.     2     0.0    0.0    0.0       str	w0, [x21, x18, lsl #2]
+# CHECK-NEXT:        2     0.0    0.0    0.2       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s
+
+sdiv	w12, w21, w0
+add	w8, w8, #1
+add	w1, w2, w0
+add	w3, w4, #1
+add	w5, w6, w0
+add	w7, w9, w0
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      18
+# CHECK-NEXT: Total uOps:        12
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      8     8.00                        sdiv	w12, w21, w0
+# CHECK-NEXT:  1      3     0.50                        add	w8, w8, #1
+# CHECK-NEXT:  1      3     0.50                        add	w1, w2, w0
+# CHECK-NEXT:  1      3     0.50                        add	w3, w4, #1
+# CHECK-NEXT:  1      3     0.50                        add	w5, w6, w0
+# CHECK-NEXT:  1      3     0.50                        add	w7, w9, w0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 5  (27.8%)
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              12  (66.7%)
+# CHECK-NEXT:  2,              6  (33.3%)
+
+# CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          12  (66.7%)
+# CHECK-NEXT:  2,          6  (33.3%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: No scheduler resources used.
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           16  (88.9%)
+# CHECK-NEXT:  6,           2  (11.1%)
+
+# CHECK:      Total ROB Entries:                64
+# CHECK-NEXT: Max Used ROB Entries:             8  ( 12.5% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  5  ( 7.8% )
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         8
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - CortexA55UnitALU
+# CHECK-NEXT: [0.1] - CortexA55UnitALU
+# CHECK-NEXT: [1]   - CortexA55UnitB
+# CHECK-NEXT: [2]   - CortexA55UnitDiv
+# CHECK-NEXT: [3.0] - CortexA55UnitFPALU
+# CHECK-NEXT: [3.1] - CortexA55UnitFPALU
+# CHECK-NEXT: [4]   - CortexA55UnitFPDIV
+# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC
+# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC
+# CHECK-NEXT: [6]   - CortexA55UnitLd
+# CHECK-NEXT: [7]   - CortexA55UnitMAC
+# CHECK-NEXT: [8]   - CortexA55UnitSt
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]
+# CHECK-NEXT: 2.50   2.50    -     8.00    -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -     8.00    -      -      -      -      -      -      -      -     sdiv	w12, w21, w0
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w8, w8, #1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w1, w2, w0
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w3, w4, #1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w5, w6, w0
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w7, w9, w0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    . .   sdiv	w12, w21, w0
+# CHECK-NEXT: [0,1]     DeeE-----R.    . .   add	w8, w8, #1
+# CHECK-NEXT: [0,2]     .DeeE----R.    . .   add	w1, w2, w0
+# CHECK-NEXT: [0,3]     .DeeE----R.    . .   add	w3, w4, #1
+# CHECK-NEXT: [0,4]     . DeeE---R.    . .   add	w5, w6, w0
+# CHECK-NEXT: [0,5]     . DeeE---R.    . .   add	w7, w9, w0
+# CHECK-NEXT: [1,0]     .    .  DeeeeeeeER   sdiv	w12, w21, w0
+# CHECK-NEXT: [1,1]     .    .  DeeE-----R   add	w8, w8, #1
+# CHECK-NEXT: [1,2]     .    .   DeeE----R   add	w1, w2, w0
+# CHECK-NEXT: [1,3]     .    .   DeeE----R   add	w3, w4, #1
+# CHECK-NEXT: [1,4]     .    .    DeeE---R   add	w5, w6, w0
+# CHECK-NEXT: [1,5]     .    .    DeeE---R   add	w7, w9, w0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     0.0    0.0    0.0       sdiv	w12, w21, w0
+# CHECK-NEXT: 1.     2     0.0    0.0    5.0       add	w8, w8, #1
+# CHECK-NEXT: 2.     2     0.0    0.0    4.0       add	w1, w2, w0
+# CHECK-NEXT: 3.     2     0.0    0.0    4.0       add	w3, w4, #1
+# CHECK-NEXT: 4.     2     0.0    0.0    3.0       add	w5, w6, w0
+# CHECK-NEXT: 5.     2     0.0    0.0    3.0       add	w7, w9, w0
+# CHECK-NEXT:        2     0.0    0.0    3.2       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s
@@ -0,0 +1,129 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s
+
+fdiv	s1, s2, s3
+add	w8, w8, #1
+add	w1, w2, w0
+add	w3, w4, #1
+add	w5, w6, w0
+add	w7, w9, w0
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      12
+# CHECK-NEXT: Total Cycles:      25
+# CHECK-NEXT: Total uOps:        12
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      13    10.00                       fdiv	s1, s2, s3
+# CHECK-NEXT:  1      3     0.50                        add	w8, w8, #1
+# CHECK-NEXT:  1      3     0.50                        add	w1, w2, w0
+# CHECK-NEXT:  1      3     0.50                        add	w3, w4, #1
+# CHECK-NEXT:  1      3     0.50                        add	w5, w6, w0
+# CHECK-NEXT:  1      3     0.50                        add	w7, w9, w0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 7  (28.0%)
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              19  (76.0%)
+# CHECK-NEXT:  2,              6  (24.0%)
+
+# CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          19  (76.0%)
+# CHECK-NEXT:  2,          6  (24.0%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: No scheduler resources used.
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           18  (72.0%)
+# CHECK-NEXT:  1,           2  (8.0%)
+# CHECK-NEXT:  2,           5  (20.0%)
+
+# CHECK:      Total ROB Entries:                64
+# CHECK-NEXT: Max Used ROB Entries:             7  ( 10.9% )
+# CHECK-NEXT: Average Used ROB Entries per cy:  2  ( 3.1% )
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         7
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - CortexA55UnitALU
+# CHECK-NEXT: [0.1] - CortexA55UnitALU
+# CHECK-NEXT: [1]   - CortexA55UnitB
+# CHECK-NEXT: [2]   - CortexA55UnitDiv
+# CHECK-NEXT: [3.0] - CortexA55UnitFPALU
+# CHECK-NEXT: [3.1] - CortexA55UnitFPALU
+# CHECK-NEXT: [4]   - CortexA55UnitFPDIV
+# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC
+# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC
+# CHECK-NEXT: [6]   - CortexA55UnitLd
+# CHECK-NEXT: [7]   - CortexA55UnitMAC
+# CHECK-NEXT: [8]   - CortexA55UnitSt
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]
+# CHECK-NEXT: 2.50   2.50    -      -      -      -     10.00   -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3.0]  [3.1]  [4]    [5.0]  [5.1]  [6]    [7]    [8]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     10.00   -      -      -      -      -     fdiv	s1, s2, s3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w8, w8, #1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w1, w2, w0
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w3, w4, #1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w5, w6, w0
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     add	w7, w9, w0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeER.    .   .   fdiv	s1, s2, s3
+# CHECK-NEXT: [0,1]     DeeER.    .    .    .   .   add	w8, w8, #1
+# CHECK-NEXT: [0,2]     .DeeER    .    .    .   .   add	w1, w2, w0
+# CHECK-NEXT: [0,3]     .DeeER    .    .    .   .   add	w3, w4, #1
+# CHECK-NEXT: [0,4]     . DeeER   .    .    .   .   add	w5, w6, w0
+# CHECK-NEXT: [0,5]     . DeeER   .    .    .   .   add	w7, w9, w0
+# CHECK-NEXT: [1,0]     .    .    DeeeeeeeeeeeeER   fdiv	s1, s2, s3
+# CHECK-NEXT: [1,1]     .    .    DeeER.    .   .   add	w8, w8, #1
+# CHECK-NEXT: [1,2]     .    .    .DeeER    .   .   add	w1, w2, w0
+# CHECK-NEXT: [1,3]     .    .    .DeeER    .   .   add	w3, w4, #1
+# CHECK-NEXT: [1,4]     .    .    . DeeER   .   .   add	w5, w6, w0
+# CHECK-NEXT: [1,5]     .    .    . DeeER   .   .   add	w7, w9, w0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     0.0    0.0    0.0       fdiv	s1, s2, s3
+# CHECK-NEXT: 1.     2     0.0    0.0    0.0       add	w8, w8, #1
+# CHECK-NEXT: 2.     2     0.0    0.0    0.0       add	w1, w2, w0
+# CHECK-NEXT: 3.     2     0.0    0.0    0.0       add	w3, w4, #1
+# CHECK-NEXT: 4.     2     0.0    0.0    0.0       add	w5, w6, w0
+# CHECK-NEXT: 5.     2     0.0    0.0    0.0       add	w7, w9, w0
+# CHECK-NEXT:        2     0.0    0.0    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views < %s | FileCheck %s
+# CHECK-NOT: Throughput Bottlenecks
+
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --bottleneck-analysis < %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-WARN
+# CHECK-WARN: warning: bottleneck analysis is not supported for in-order CPU 'cortex-a55'
+
+add      w2, w3, #1
+
diff --git a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s
@@ -0,0 +1,75 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=arm -mcpu=cortex-m7 --timeline --iterations=1 < %s | FileCheck %s
+
+add r1, r1, #1
+# ReadAdvance: 0 - the following add reads r1 without any read advance.
+add r1, r1, #2
+# ReadAdvance: -1 - the following vldr sees the latency of r1 increased by one cycle.
+vldr d0, [r1]
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      7
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        add.w	r1, r1, #1
+# CHECK-NEXT:  1      1     0.50                        add.w	r1, r1, #2
+# CHECK-NEXT:  1      3     1.00    *                   vldr	d0, [r1]
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - M7UnitALU
+# CHECK-NEXT: [0.1] - M7UnitALU
+# CHECK-NEXT: [1]   - M7UnitBranch
+# CHECK-NEXT: [2.0] - M7UnitLoad
+# CHECK-NEXT: [2.1] - M7UnitLoad
+# CHECK-NEXT: [3]   - M7UnitMAC
+# CHECK-NEXT: [4]   - M7UnitSIMD
+# CHECK-NEXT: [5]   - M7UnitShift1
+# CHECK-NEXT: [6]   - M7UnitShift2
+# CHECK-NEXT: [7]   - M7UnitStore
+# CHECK-NEXT: [8]   - M7UnitVFP
+# CHECK-NEXT: [9.0] - M7UnitVPort
+# CHECK-NEXT: [9.1] - M7UnitVPort
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2.0]  [2.1]  [3]    [4]    [5]    [6]    [7]    [8]    [9.0]  [9.1]
+# CHECK-NEXT: 1.00   1.00    -      -     1.00    -      -      -      -      -      -      -     2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2.0]  [2.1]  [3]    [4]    [5]    [6]    [7]    [8]    [9.0]  [9.1]  Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     add.w	r1, r1, #1
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     add.w	r1, r1, #2
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -     2.00   vldr	d0, [r1]
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DER  ..   add.w	r1, r1, #1
+# CHECK-NEXT: [0,1]     .DER ..   add.w	r1, r1, #2
+# CHECK-NEXT: [0,2]     .  DeER   vldr	d0, [r1]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       add.w	r1, r1, #1
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add.w	r1, r1, #2
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       vldr	d0, [r1]
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s
--- a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s
+++ b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s
@@ -1,3 +1,3 @@
-# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu.
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s
+# CHECK: warning: support for in-order CPU 'atom' is experimental.
+movsbw	%al, %di
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -257,14 +257,15 @@
     O = Default.getValue();
 }
 
-static void processViewOptions() {
+static void processViewOptions(bool IsOutOfOrder) {
   if (!EnableAllViews.getNumOccurrences() &&
       !EnableAllStats.getNumOccurrences())
     return;
 
   if (EnableAllViews.getNumOccurrences()) {
     processOptionImpl(PrintSummaryView, EnableAllViews);
-    processOptionImpl(EnableBottleneckAnalysis, EnableAllViews);
+    if (IsOutOfOrder)
+      processOptionImpl(EnableBottleneckAnalysis, EnableAllViews);
     processOptionImpl(PrintResourcePressureView, EnableAllViews);
     processOptionImpl(PrintTimelineView, EnableAllViews);
     processOptionImpl(PrintInstructionInfoView, EnableAllViews);
@@ -327,9 +328,6 @@
     return 1;
   }
 
-  // Apply overrides to llvm-mca specific options.
-  processViewOptions();
-
   if (MCPU == "native")
     MCPU = std::string(llvm::sys::getHostCPUName());
 
@@ -339,10 +337,10 @@
   if (!STI->isCPUStringValid(MCPU))
     return 1;
 
-  if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) {
-    WithColor::error() << "please specify an out-of-order cpu. '" << MCPU
-                       << "' is an in-order cpu.\n";
-    return 1;
+  bool IsOutOfOrder = STI->getSchedModel().isOutOfOrder();
+  if (!PrintInstructionTables && !IsOutOfOrder) {
+    WithColor::warning() << "support for in-order CPU '" << MCPU
+                         << "' is experimental.\n";
   }
 
   if (!STI->getSchedModel().hasInstrSchedModel()) {
@@ -358,6 +356,9 @@
     return 1;
   }
 
+  // Apply overrides to llvm-mca specific options.
+  processViewOptions(IsOutOfOrder);
+
   std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
   assert(MRI && "Unable to create target register info!");
 
@@ -539,6 +540,11 @@
           std::make_unique<mca::SummaryView>(SM, Insts, DispatchWidth));
 
     if (EnableBottleneckAnalysis) {
+      if (!IsOutOfOrder) {
+        WithColor::warning()
+            << "bottleneck analysis is not supported for in-order CPU '" << MCPU
+            << "'.\n";
+      }
       Printer.addView(std::make_unique<mca::BottleneckAnalysis>(
           *STI, *IP, Insts, S.getNumIterations()));
     }
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -993,6 +993,7 @@
     SCDesc.NumMicroOps = 0;
     SCDesc.BeginGroup = false;
     SCDesc.EndGroup = false;
+    SCDesc.RetireOOO = false;
     SCDesc.WriteProcResIdx = 0;
     SCDesc.WriteLatencyIdx = 0;
     SCDesc.ReadAdvanceIdx = 0;
@@ -1095,6 +1096,7 @@
         SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup");
         SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue");
         SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue");
+        SCDesc.RetireOOO |= WriteRes->getValueAsBit("RetireOOO");
 
         // Create an entry for each ProcResource listed in WriteRes.
         RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");
@@ -1293,7 +1295,7 @@
     std::vector<MCSchedClassDesc> &SCTab =
       SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())];
 
-    OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup,"
+    OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO,"
        << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n";
     OS << "static const llvm::MCSchedClassDesc "
        << PI->ModelName << "SchedClasses[] = {\n";
@@ -1304,7 +1306,7 @@
            && "invalid class not first");
     OS << "  {DBGFIELD(\"InvalidSchedClass\")  "
        << MCSchedClassDesc::InvalidNumMicroOps
-       << ", false, false,  0, 0,  0, 0,  0, 0},\n";
+       << ", false, false, false, 0, 0,  0, 0,  0, 0},\n";
 
     for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) {
       MCSchedClassDesc &MCDesc = SCTab[SCIdx];
@@ -1315,6 +1317,7 @@
       OS << MCDesc.NumMicroOps
          << ", " << ( MCDesc.BeginGroup ? "true" : "false" )
          << ", " << ( MCDesc.EndGroup ? "true" : "false" )
+         << ", " << ( MCDesc.RetireOOO ? "true" : "false" )
          << ", " << format("%2d", MCDesc.WriteProcResIdx)
          << ", " << MCDesc.NumWriteProcResEntries
          << ", " << format("%2d", MCDesc.WriteLatencyIdx)