diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst
--- a/llvm/docs/CommandGuide/llvm-mca.rst
+++ b/llvm/docs/CommandGuide/llvm-mca.rst
@@ -212,6 +212,11 @@
 Print the requested views in JSON format. The instructions and the processor
 resources are printed as members of special top level JSON objects. The
 individual views refer to them by index.
+
+.. option:: -disable-cb
+
+  Force usage of the generic CustomBehaviour class rather than using the target
+  specific class. The generic class never detects any custom hazards.
 
 
 EXIT STATUS
@@ -978,3 +983,33 @@
 retire. :program:`llvm-mca` ensures that writes are committed in-order. However,
 an instruction is allowed to commit writes and retire out-of-order if
 ``RetireOOO`` property is true for at least one of its writes.
+
+Custom Behaviour
+""""""""""""""""""""""""""""""""""""
+Some instructions aren't modeled properly by :program:`llvm-mca`, usually
+because the instruction can't be expressed perfectly within the scheduling
+model. Modifying the scheduling model isn't always a viable option, though
+(for example, because the instruction is modeled incorrectly on purpose, or
+because its behaviour is quite complex). This is where the CustomBehaviour
+class can be utilized.
+
+:program:`llvm-mca` comes with one generic and multiple target specific
+CustomBehaviour classes. The generic class is used if the ``-disable-cb``
+flag is passed or if a target specific CustomBehaviour class doesn't exist
+for the target. (The generic class does nothing.) Currently, the
+CustomBehaviour class is only a part of the in-order pipeline, but there
+are plans to add it to the out-of-order pipeline in the future.
+
+CustomBehaviour's main method is `checkCustomHazard()`, which takes as input
+an `InstRef` (the instruction that the pipeline is currently trying to
+dispatch) as well as a list of `InstRef` (all of the instructions that are
+still executing within the pipeline). As output, the method returns an
+integer representing the number of cycles that the current instruction must
+stall for (this can be an underestimate if you don't know the exact number).
+Using these inputs, as well as the `MCSubtargetInfo` and `MCInstrInfo` that
+the base class holds references to, developers should be able to detect and
+enforce dependencies that :program:`llvm-mca` didn't pick up on its own.
+
+If you'd like to add a CustomBehaviour class for a target that doesn't
+already have one, refer to one of the existing ones to see how to set it up
+(a minimal illustrative sketch also follows the end of this patch). Remember
+to look at (and add to) `llvm/tools/llvm-mca/lib/CMakeLists.txt`.
diff --git a/llvm/include/llvm/MCA/Context.h b/llvm/include/llvm/MCA/Context.h
--- a/llvm/include/llvm/MCA/Context.h
+++ b/llvm/include/llvm/MCA/Context.h
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
 #include "llvm/MCA/Pipeline.h"
 #include "llvm/MCA/SourceMgr.h"
@@ -67,12 +68,14 @@
   /// Construct a basic pipeline for simulating an out-of-order pipeline.
   /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages.
   std::unique_ptr<Pipeline> createDefaultPipeline(const PipelineOptions &Opts,
-                                                  SourceMgr &SrcMgr);
+                                                  SourceMgr &SrcMgr,
+                                                  CustomBehaviour &CB);
 
   /// Construct a basic pipeline for simulating an in-order pipeline.
   /// This pipeline consists of Fetch, InOrderIssue, and Retire stages.
   std::unique_ptr<Pipeline> createInOrderPipeline(const PipelineOptions &Opts,
-                                                  SourceMgr &SrcMgr);
+                                                  SourceMgr &SrcMgr,
+                                                  CustomBehaviour &CB);
 };
 
 } // namespace mca
diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/MCA/CustomBehaviour.h
@@ -0,0 +1,59 @@
+//===---------------------- CustomBehaviour.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the base class CustomBehaviour which can be inherited from
+/// by specific targets (ex. llvm/tools/llvm-mca/lib/X86CustomBehaviour.h).
+/// CustomBehaviour is designed to enforce custom behaviour and dependencies
+/// within the llvm-mca pipeline simulation that llvm-mca isn't already capable
+/// of extracting from the Scheduling Models.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_CUSTOMBEHAVIOUR_H
+#define LLVM_MCA_CUSTOMBEHAVIOUR_H
+
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/SourceMgr.h"
+
+namespace llvm {
+namespace mca {
+
+class CustomBehaviour {
+protected:
+  const MCSubtargetInfo &STI;
+  const SourceMgr &SrcMgr;
+  const MCInstrInfo &MCII;
+
+public:
+  CustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+                  const MCInstrInfo &MCII)
+      : STI(STI), SrcMgr(SrcMgr), MCII(MCII) {}
+
+  virtual ~CustomBehaviour() {}
+
+  // Before the llvm-mca pipeline dispatches an instruction, it first checks
+  // for any register or resource dependencies / hazards. If it doesn't find
+  // any, this method will be invoked to determine if there are any custom
+  // hazards that the instruction needs to wait for.
+  // The return value of this method is the number of cycles that the
+  // instruction needs to wait for.
+  // It's safe to underestimate the number of cycles to wait for, since these
+  // checks will be invoked again before the instruction gets dispatched.
+  // However, it's not safe to overestimate the number of cycles to wait for,
+  // since the instruction will then wait for AT LEAST that number of cycles
+  // before it attempts to dispatch again.
+  virtual unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                                     const InstRef &IR);
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif /* LLVM_MCA_CUSTOMBEHAVIOUR_H */
diff --git a/llvm/include/llvm/MCA/HWEventListener.h b/llvm/include/llvm/MCA/HWEventListener.h
--- a/llvm/include/llvm/MCA/HWEventListener.h
+++ b/llvm/include/llvm/MCA/HWEventListener.h
@@ -115,6 +115,7 @@
     SchedulerQueueFull,
     LoadQueueFull,
     StoreQueueFull,
+    CustomBehaviourStall,
     LastGenericEvent
   };
diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -33,6 +33,94 @@
 constexpr int UNKNOWN_CYCLES = -512;
 
+/// A representation of an mca::Instruction operand
+/// for use in mca::CustomBehaviour.
+class MCAOperand {
+  // This class is mostly copied from MCOperand within
+  // MCInst.h, except that we don't keep track of
+  // expressions or sub-instructions.
+  enum MCAOperandType : unsigned char {
+    kInvalid,      ///< Uninitialized, Relocatable immediate, or Sub-instruction.
+ kRegister, ///< Register operand. + kImmediate, ///< Immediate operand. + kSFPImmediate, ///< Single-floating-point immediate operand. + kDFPImmediate, ///< Double-Floating-point immediate operand. + }; + MCAOperandType Kind = kInvalid; + + union { + unsigned RegVal; + int64_t ImmVal; + uint32_t SFPImmVal; + uint64_t FPImmVal; + }; + +public: + MCAOperand() : FPImmVal(0) {} + + bool isValid() const { return Kind != kInvalid; } + bool isReg() const { return Kind == kRegister; } + bool isImm() const { return Kind == kImmediate; } + bool isSFPImm() const { return Kind == kSFPImmediate; } + bool isDFPImm() const { return Kind == kDFPImmediate; } + + /// Returns the register number. + unsigned getReg() const { + assert(isReg() && "This is not a register operand!"); + return RegVal; + } + + int64_t getImm() const { + assert(isImm() && "This is not an immediate"); + return ImmVal; + } + + uint32_t getSFPImm() const { + assert(isSFPImm() && "This is not an SFP immediate"); + return SFPImmVal; + } + + uint64_t getDFPImm() const { + assert(isDFPImm() && "This is not an FP immediate"); + return FPImmVal; + } + + static MCAOperand createReg(unsigned Reg) { + MCAOperand Op; + Op.Kind = kRegister; + Op.RegVal = Reg; + return Op; + } + + static MCAOperand createImm(int64_t Val) { + MCAOperand Op; + Op.Kind = kImmediate; + Op.ImmVal = Val; + return Op; + } + + static MCAOperand createSFPImm(uint32_t Val) { + MCAOperand Op; + Op.Kind = kSFPImmediate; + Op.SFPImmVal = Val; + return Op; + } + + static MCAOperand createDFPImm(uint64_t Val) { + MCAOperand Op; + Op.Kind = kDFPImmediate; + Op.FPImmVal = Val; + return Op; + } + + static MCAOperand createInvalid() { + MCAOperand Op; + Op.Kind = kInvalid; + Op.FPImmVal = 0; + return Op; + } +}; + /// A register write descriptor. struct WriteDescriptor { // Operand index. The index is negative for implicit writes only. @@ -409,8 +497,15 @@ // One entry per each implicit and explicit register use. 
 SmallVector<ReadState, 4> Uses;
 
+  // List of operands which can be used by mca::CustomBehaviour.
+  SmallVector<MCAOperand, 6> Operands;
+
+  // Instruction opcode which can be used by mca::CustomBehaviour.
+  unsigned Opcode;
+
 public:
-  InstructionBase(const InstrDesc &D) : Desc(D), IsOptimizableMove(false) {}
+  InstructionBase(const InstrDesc &D, const unsigned Opcode)
+      : Desc(D), IsOptimizableMove(false), Opcode(Opcode) {}
 
   SmallVectorImpl<WriteState> &getDefs() { return Defs; }
   ArrayRef<WriteState> getDefs() const { return Defs; }
@@ -420,6 +515,17 @@
   unsigned getLatency() const { return Desc.MaxLatency; }
   unsigned getNumMicroOps() const { return Desc.NumMicroOps; }
+  unsigned getOpcode() const { return Opcode; }
+
+  const MCAOperand &getOperand(unsigned Idx) const { return Operands[Idx]; }
+  unsigned getNumOperands() const { return Operands.size(); }
+  void addOperand(const MCAOperand Op) { Operands.push_back(Op); }
+  using iterator = SmallVectorImpl<MCAOperand>::iterator;
+  using const_iterator = SmallVectorImpl<MCAOperand>::const_iterator;
+  iterator begin() { return Operands.begin(); }
+  const_iterator begin() const { return Operands.begin(); }
+  iterator end() { return Operands.end(); }
+  const_iterator end() const { return Operands.end(); }
 
   bool hasDependentUsers() const {
     return any_of(Defs,
@@ -490,11 +596,11 @@
   bool IsEliminated;
 
 public:
-  Instruction(const InstrDesc &D)
-      : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
-        RCUTokenID(0), LSUTokenID(0), UsedBuffers(D.UsedBuffers),
-        CriticalRegDep(), CriticalMemDep(), CriticalResourceMask(0),
-        IsEliminated(false) {}
+  Instruction(const InstrDesc &D, const unsigned Opcode)
+      : InstructionBase(D, Opcode), Stage(IS_INVALID),
+        CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0), LSUTokenID(0),
+        UsedBuffers(D.UsedBuffers), CriticalRegDep(), CriticalMemDep(),
+        CriticalResourceMask(0), IsEliminated(false) {}
 
   unsigned getRCUTokenID() const { return RCUTokenID; }
   unsigned getLSUTokenID() const { return LSUTokenID; }
diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
--- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_MCA_STAGES_INORDERISSUESTAGE_H
 #define LLVM_MCA_STAGES_INORDERISSUESTAGE_H
 
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/HardwareUnits/ResourceManager.h"
 #include "llvm/MCA/SourceMgr.h"
 #include "llvm/MCA/Stages/Stage.h"
@@ -23,7 +24,13 @@
 class RegisterFile;
 
 struct StallInfo {
-  enum class StallKind { DEFAULT, REGISTER_DEPS, DISPATCH, DELAY };
+  enum class StallKind {
+    DEFAULT,
+    REGISTER_DEPS,
+    DISPATCH,
+    DELAY,
+    CUSTOM_STALL
+  };
 
   InstRef IR;
   unsigned CyclesLeft;
@@ -46,6 +53,7 @@
   const MCSubtargetInfo &STI;
   RegisterFile &PRF;
   ResourceManager RM;
+  CustomBehaviour &CB;
 
   /// Instructions that were issued, but not executed yet.
 SmallVector<InstRef> IssuedInst;
@@ -101,7 +109,8 @@
   void retireInstruction(InstRef &IR);
 
 public:
-  InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF);
+  InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF,
+                    CustomBehaviour &CB);
 
   unsigned getIssueWidth() const;
   bool isAvailable(const InstRef &) const override;
diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt
--- a/llvm/lib/MCA/CMakeLists.txt
+++ b/llvm/lib/MCA/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMMCA
   CodeEmitter.cpp
   Context.cpp
+  CustomBehaviour.cpp
   HWEventListener.cpp
   HardwareUnits/HardwareUnit.cpp
   HardwareUnits/LSUnit.cpp
diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -29,11 +29,12 @@
 namespace mca {
 
 std::unique_ptr<Pipeline>
-Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
+Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr,
+                               CustomBehaviour &CB) {
   const MCSchedModel &SM = STI.getSchedModel();
 
   if (!SM.isOutOfOrder())
-    return createInOrderPipeline(Opts, SrcMgr);
+    return createInOrderPipeline(Opts, SrcMgr, CB);
 
   // Create the hardware units defining the backend.
   auto RCU = std::make_unique<RetireControlUnit>(SM);
@@ -69,13 +70,14 @@
 }
 
 std::unique_ptr<Pipeline>
-Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
+Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr,
+                               CustomBehaviour &CB) {
   const MCSchedModel &SM = STI.getSchedModel();
   auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
 
   // Create the pipeline stages.
   auto Entry = std::make_unique<EntryStage>(SrcMgr);
-  auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF);
+  auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF, CB);
   auto StagePipeline = std::make_unique<Pipeline>();
 
   // Pass the ownership of all the hardware units to this Context.
diff --git a/llvm/lib/MCA/CustomBehaviour.cpp b/llvm/lib/MCA/CustomBehaviour.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/MCA/CustomBehaviour.cpp
@@ -0,0 +1,27 @@
+//===--------------------- CustomBehaviour.cpp ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the CustomBehaviour interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/CustomBehaviour.h"
+
+namespace llvm {
+namespace mca {
+
+unsigned
+CustomBehaviour::checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                                   const InstRef &IR) {
+  // 0 signifies that there are no hazards that need to be waited on.
+  return 0;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -616,7 +616,26 @@
   if (!DescOrErr)
     return DescOrErr.takeError();
   const InstrDesc &D = *DescOrErr;
-  std::unique_ptr<Instruction> NewIS = std::make_unique<Instruction>(D);
+  std::unique_ptr<Instruction> NewIS =
+      std::make_unique<Instruction>(D, MCI.getOpcode());
+
+  // Build the list of operands to be used by mca::CustomBehaviour.
+  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
+    MCAOperand Op;
+    const MCOperand &MCOp = MCI.getOperand(Idx);
+    if (MCOp.isReg()) {
+      Op = MCAOperand::createReg(MCOp.getReg());
+    } else if (MCOp.isImm()) {
+      Op = MCAOperand::createImm(MCOp.getImm());
+    } else if (MCOp.isSFPImm()) {
+      Op = MCAOperand::createSFPImm(MCOp.getSFPImm());
+    } else if (MCOp.isDFPImm()) {
+      Op = MCAOperand::createDFPImm(MCOp.getDFPImm());
+    } else {
+      Op = MCAOperand::createInvalid();
+    }
+    NewIS->addOperand(Op);
+  }
 
   // Check if this is a dependency breaking instruction.
   APInt Mask;
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -43,8 +43,8 @@
 }
 
 InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI,
-                                     RegisterFile &PRF)
-    : STI(STI), PRF(PRF), RM(STI.getSchedModel()), NumIssued(), SI(),
+                                     RegisterFile &PRF, CustomBehaviour &CB)
+    : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), NumIssued(), SI(),
       CarryOver(), Bandwidth(), LastWriteBackCycle() {}
 
 unsigned InOrderIssueStage::getIssueWidth() const {
@@ -125,6 +125,11 @@
     return false;
   }
 
+  if (unsigned CustomStallCycles = CB.checkCustomHazard(IssuedInst, IR)) {
+    SI.update(IR, CustomStallCycles, StallInfo::StallKind::CUSTOM_STALL);
+    return false;
+  }
+
   if (LastWriteBackCycle) {
     if (!IR.getInstruction()->getDesc().RetireOOO) {
       unsigned NextWriteBackCycle = findFirstWriteBackCycle(IR);
@@ -333,6 +338,11 @@
         HWPressureEvent(HWPressureEvent::RESOURCES, IR));
     break;
   }
+  case StallInfo::StallKind::CUSTOM_STALL: {
+    notifyEvent<HWStallEvent>(
+        HWStallEvent(HWStallEvent::CustomBehaviourStall, IR));
+    break;
+  }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -20,6 +20,7 @@
 def WriteExport : SchedWrite;
 def WriteLDS : SchedWrite;
 def WriteSALU : SchedWrite;
+def WriteCBSALU : SchedWrite; // RetireOOO flag for llvm-mca (s_waitcnt instrs)
 def WriteSMEM : SchedWrite;
 def WriteVMEM : SchedWrite;
 def WriteBarrier : SchedWrite;
@@ -256,10 +257,13 @@
 def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
+let RetireOOO = 1 in
+def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
 
 def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[WriteCBSALU], (instregex "S_WAITCNT.*")>;
 
 }  // End SchedModel = GFX10SpeedModel
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-custom-behaviour.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-custom-behaviour.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-custom-behaviour.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx1010 -dispatch-stats=false -scheduler-stats=false < %s | FileCheck %s
+
+s_load_dwordx4 s[0:3], s[4:5], 0x0
+v_mov_b32_e32 v0, 0
+s_waitcnt vmcnt(0) lgkmcnt(0)
+s_waitcnt_vscnt null, 0x0
+global_load_dword v1, v0, s[0:1] glc dlc
+s_waitcnt vmcnt(0)
+
+# CHECK: Total Cycles: 34302
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-depctr-unsupported-warning.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-depctr-unsupported-warning.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-depctr-unsupported-warning.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx1010 -iterations=3 -dispatch-stats=false -scheduler-stats=false < %s 2>&1 | FileCheck %s
+
+v_mul_hi_u32 v2, s3, v0
+v_mul_lo_u32 v2, v2, s2
+v_sub_nc_u32_e32 v2, s3, v2
+s_add_i32 s3, s3, 1
+v_subrev_nc_u32_e32 v3, s2, v2
+v_cmp_le_u32_e32 vcc_lo, s2, v2
+v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+v_subrev_nc_u32_e32 v3, s2, v2
+v_cmp_le_u32_e32 vcc_lo, s2, v2
+v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+global_store_dword v1, v2, s[0:1]
+s_waitcnt_depctr 0xffe3
+s_add_u32 s0, s0, 4
+s_addc_u32 s1, s1, 0
+s_cmpk_eq_i32 s3, 0x400
+s_cbranch_scc0 BB3_1
+
+# CHECK: warning: Instruction s_waitcnt_depctr currently unsupported and will not trigger a wait.
diff --git a/llvm/tools/llvm-mca/CMakeLists.txt b/llvm/tools/llvm-mca/CMakeLists.txt
--- a/llvm/tools/llvm-mca/CMakeLists.txt
+++ b/llvm/tools/llvm-mca/CMakeLists.txt
@@ -1,5 +1,7 @@
 include_directories(include)
 
+add_subdirectory(lib)
+
 set(LLVM_LINK_COMPONENTS
   AllTargetsAsmParsers
   AllTargetsDescs
@@ -30,3 +32,7 @@
   )
 
 set(LLVM_MCA_SOURCE_DIR ${CURRENT_SOURCE_DIR})
+
+target_link_libraries(llvm-mca PRIVATE
+  ${LLVM_MCA_CUSTOMBEHAVIOUR_TARGETS}
+  )
diff --git a/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
--- a/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
+++ b/llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -77,6 +77,8 @@
   printStalls(SS, HWStalls[HWStallEvent::StoreQueueFull], NumCycles);
   SS << "\nGROUP - Static restrictions on the dispatch group: ";
   printStalls(SS, HWStalls[HWStallEvent::DispatchGroupStall], NumCycles);
+  SS << "\nCB - Custom Behaviour stall: ";
+  printStalls(SS, HWStalls[HWStallEvent::CustomBehaviourStall], NumCycles);
   SS << '\n';
   SS.flush();
   OS << Buffer;
diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
@@ -0,0 +1,97 @@
+//===------------------- AMDGPUCustomBehaviour.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the AMDGPUCustomBehaviour class which inherits from
+/// CustomBehaviour.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_AMDGPUCUSTOMBEHAVIOUR_H
+#define LLVM_TOOLS_LLVM_MCA_AMDGPUCUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+#include "llvm/Support/TargetParser.h"
+
+namespace llvm {
+namespace mca {
+
+struct WaitCntInfo {
+  bool VM_CNT = false;
+  bool EXP_CNT = false;
+  bool LGKM_CNT = false;
+  bool VS_CNT = false;
+};
+
+class AMDGPUCustomBehaviour : public CustomBehaviour {
+  /// Whenever MCA would like to dispatch an s_waitcnt instruction,
+  /// we must check all the instructions that are still executing to see if
+  /// they modify the same CNT that we need to wait on. This vector
+  /// gets built in the constructor and contains one WaitCntInfo struct
+  /// for each instruction within the SourceMgr. Each element
+  /// tells us which CNTs that instruction may interact with.
+  /// We conservatively assume some instructions interact with more
+  /// CNTs than they do in reality, so we will occasionally wait
+  /// longer than necessary, but we should never wait for too short a time.
+  std::vector<WaitCntInfo> InstrWaitCntInfo;
+
+  /// This method gets called from the constructor and is
+  /// where we set up the InstrWaitCntInfo vector.
+  /// The core logic for determining which CNTs an instruction
+  /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter().
+  /// Unfortunately, some of the logic from that function is not available to
+  /// us in this scope, so we conservatively end up assuming that some
+  /// instructions interact with more CNTs than they do in reality.
+  void generateWaitCntInfo();
+  /// Helper function used in generateWaitCntInfo().
+  bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
+                       unsigned OpName) const;
+  /// Helper function used in generateWaitCntInfo().
+  bool isAlwaysGDS(uint16_t Opcode) const;
+  /// Helper function used in generateWaitCntInfo().
+  bool isVMEM(const MCInstrDesc &MCID);
+  /// This method gets called from checkCustomHazard when mca is attempting to
+  /// dispatch an s_waitcnt instruction (or one of its variants). The method
+  /// looks at each of the instructions that are still executing in the
+  /// pipeline to determine if the waitcnt should force a wait.
+  unsigned handleWaitCnt(const SmallVector<InstRef> &IssuedInst,
+                         const InstRef &IR);
+  /// Based on the type of s_waitcnt instruction we are looking at, and what
+  /// its operands are, this method will set the values for each of the cnt
+  /// references provided as arguments.
+  void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt,
+                      unsigned &Lgkmcnt, unsigned &Vscnt);
+
+  /// Some instructions aren't automatically detected as interacting with any
+  /// of the CNTs even though they should. This method can be used to manually
+  /// set the WaitCntInfo for those instructions.
+  bool manuallySetWaitCntInfo(const std::unique_ptr<Instruction> &Inst,
+                              const int Index);
+
+public:
+  AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+                        const MCInstrInfo &MCII);
+
+  ~AMDGPUCustomBehaviour() {}
+
+  /// This method is used to determine if an instruction
+  /// should be allowed to be dispatched. The return value is
+  /// how many cycles until the instruction can be dispatched.
+  /// This method is called after MCA has already checked for
+  /// register and hardware dependencies, so this method should only
+  /// implement custom behaviour and dependencies that are not picked up
+  /// by MCA naturally.
+  unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                             const InstRef &IR) override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif /* LLVM_TOOLS_LLVM_MCA_AMDGPUCUSTOMBEHAVIOUR_H */
diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
@@ -0,0 +1,426 @@
+//===----------------- AMDGPUCustomBehaviour.cpp ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the AMDGPUCustomBehaviour class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCustomBehaviour.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/Support/WithColor.h"
+
+namespace llvm {
+namespace mca {
+
+AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
+                                             const SourceMgr &SrcMgr,
+                                             const MCInstrInfo &MCII)
+    : CustomBehaviour(STI, SrcMgr, MCII) {
+  generateWaitCntInfo();
+}
+
+unsigned AMDGPUCustomBehaviour::checkCustomHazard(
+    const SmallVector<InstRef> &IssuedInst, const InstRef &IR) {
+  const Instruction &Inst = *IR.getInstruction();
+  unsigned Opcode = Inst.getOpcode();
+
+  switch (Opcode) {
+  default:
+    return 0;
+  case AMDGPU::S_WAITCNT:
+  case AMDGPU::S_WAITCNT_DEPCTR:
+  case AMDGPU::S_WAITCNT_EXPCNT:
+  case AMDGPU::S_WAITCNT_LGKMCNT:
+  case AMDGPU::S_WAITCNT_VMCNT:
+  case AMDGPU::S_WAITCNT_VSCNT:
+    assert(false &&
+           "These are pseudo instructions and should never appear in asm.");
+    return 0;
+  case AMDGPU::S_WAITCNT_DEPCTR_gfx10:
+    // We can't find any documentation in the ISAs about what this
+    // instruction does, so we are unsure of how to model it.
+    WithColor::warning() << "Instruction s_waitcnt_depctr "
+                         << "currently unsupported and will not "
+                         << "trigger a wait.\n";
+    return 0;
+  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx6_gfx7:
+  case AMDGPU::S_WAITCNT_vi:
+    // s_endpgm also behaves as if there is an implicit
+    // s_waitcnt 0, but it's not clear whether it would be appropriate
+    // to model this in llvm-mca given how the iterations work
+    // while simulating the pipeline over and over.
+    return handleWaitCnt(IssuedInst, IR);
+  }
+
+  return 0;
+}
+
+unsigned
+AMDGPUCustomBehaviour::handleWaitCnt(const SmallVector<InstRef> &IssuedInst,
+                                     const InstRef &IR) {
+  // Set the max values to begin.
+  unsigned Vmcnt = 63;
+  unsigned Expcnt = 7;
+  unsigned Lgkmcnt = 31;
+  unsigned Vscnt = 63;
+  unsigned CurrVmcnt = 0;
+  unsigned CurrExpcnt = 0;
+  unsigned CurrLgkmcnt = 0;
+  unsigned CurrVscnt = 0;
+  unsigned CyclesToWaitVm = ~0U;
+  unsigned CyclesToWaitExp = ~0U;
+  unsigned CyclesToWaitLgkm = ~0U;
+  unsigned CyclesToWaitVs = ~0U;
+
+  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
+
+  // We will now look at each of the currently executing instructions
+  // to find out if this wait instruction still needs to wait.
+ for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { + const InstRef &PrevIR = *I; + const Instruction &PrevInst = *PrevIR.getInstruction(); + const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); + const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; + const int CyclesLeft = PrevInst.getCyclesLeft(); + assert(CyclesLeft != UNKNOWN_CYCLES && + "We should know how many cycles are left for this instruction"); + if (PrevInstWaitInfo.VM_CNT) { + CurrVmcnt++; + if (CyclesLeft < CyclesToWaitVm) + CyclesToWaitVm = CyclesLeft; + } + if (PrevInstWaitInfo.EXP_CNT) { + CurrExpcnt++; + if (CyclesLeft < CyclesToWaitExp) + CyclesToWaitExp = CyclesLeft; + } + if (PrevInstWaitInfo.LGKM_CNT) { + CurrLgkmcnt++; + if (CyclesLeft < CyclesToWaitLgkm) + CyclesToWaitLgkm = CyclesLeft; + } + if (PrevInstWaitInfo.VS_CNT) { + CurrVscnt++; + if (CyclesLeft < CyclesToWaitVs) + CyclesToWaitVs = CyclesLeft; + } + } + + unsigned CyclesToWait = ~0U; + if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait) + CyclesToWait = CyclesToWaitVm; + if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait) + CyclesToWait = CyclesToWaitExp; + if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait) + CyclesToWait = CyclesToWaitLgkm; + if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait) + CyclesToWait = CyclesToWaitVs; + + // We may underestimate how many cycles we need to wait, but this + // isn't a big deal. Our return value is just how many cycles until + // this function gets run again. So as long as we don't overestimate + // the wait time, we'll still end up stalling at this instruction + // for the correct number of cycles. + + if (CyclesToWait == ~0U) + return 0; + return CyclesToWait; +} + +void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, + unsigned &Expcnt, unsigned &Lgkmcnt, + unsigned &Vscnt) { + AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); + const Instruction &Inst = *IR.getInstruction(); + unsigned Opcode = Inst.getOpcode(); + + switch (Opcode) { + case AMDGPU::S_WAITCNT_EXPCNT_gfx10: + case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: + case AMDGPU::S_WAITCNT_VMCNT_gfx10: + case AMDGPU::S_WAITCNT_VSCNT_gfx10: { + const MCAOperand &OpReg = Inst.getOperand(0); + const MCAOperand &OpImm = Inst.getOperand(1); + assert(OpReg.isReg() && "First operand should be a register."); + assert(OpImm.isImm() && "Second operand should be an immediate."); + if (OpReg.getReg() != AMDGPU::SGPR_NULL) { + // Instruction is using a real register. + // Since we can't know what value this register will have, + // we can't compute what the value of this wait should be. + WithColor::warning() << "The register component of " + << MCII.getName(Opcode) << " will be completely " + << "ignored. So the wait may not be accurate.\n"; + } + switch (Opcode) { + // Redundant switch so I don't have to repeat the code above + // for each case. There are more clever ways to avoid this + // extra switch and anyone can feel free to implement one of them. 
+    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+      Expcnt = OpImm.getImm();
+      break;
+    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+      Lgkmcnt = OpImm.getImm();
+      break;
+    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+      Vmcnt = OpImm.getImm();
+      break;
+    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+      Vscnt = OpImm.getImm();
+      break;
+    }
+    return;
+  }
+  case AMDGPU::S_WAITCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx6_gfx7:
+  case AMDGPU::S_WAITCNT_vi:
+    unsigned WaitCnt = Inst.getOperand(0).getImm();
+    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
+    return;
+  }
+}
+
+void AMDGPUCustomBehaviour::generateWaitCntInfo() {
+  // The core logic from this function is taken from
+  // SIInsertWaitcnts::updateEventWaitcntAfter().
+  // In that pass, some of the instructions that are being looked at are
+  // pseudo instructions. The pseudo instructions have the proper
+  // mayLoad and mayStore flags associated with them, but the
+  // 'real' instructions that we encounter in this function
+  // may not have their mayLoad and mayStore flags set.
+  // For example, S_LOAD_DWORDX2_IMM (op 2293) will be seen
+  // in SIInsertWaitcnts.cpp, but after llc finishes compiling
+  // the assembly source, the instruction will become
+  // S_LOAD_DWORDX2_IMM_si (op 14208).
+  // S_LOAD_DWORDX2_IMM has the mayLoad flag.
+  // S_LOAD_DWORDX2_IMM_si does not have the mayLoad flag.
+  // For this reason, the control flow has been modified in this
+  // function to conservatively assume that some instructions
+  // interact with more CNTs than they should. This will
+  // result in occasionally waiting longer than necessary
+  // at an s_waitcnt instruction.
+  InstrWaitCntInfo.resize(SrcMgr.size());
+
+  int Index = 0;
+  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
+    const std::unique_ptr<Instruction> &Inst = *I;
+    // Due to the flags of 'real' instructions not being set properly,
+    // some instructions are (incorrectly) not detected as interacting
+    // with any CNTs. This function will manually handle those instructions
+    // that we are aware of.
+    if (manuallySetWaitCntInfo(Inst, Index))
+      continue;
+    unsigned Opcode = Inst->getOpcode();
+    const MCInstrDesc &MCID = MCII.get(Opcode);
+    if ((MCID.TSFlags & SIInstrFlags::DS) &&
+        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
+      InstrWaitCntInfo[Index].LGKM_CNT = true;
+      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
+        InstrWaitCntInfo[Index].EXP_CNT = true;
+    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
+      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
+      // and mayAccessLDSThroughFlat(Inst) would both return true for this
+      // instruction.
+ InstrWaitCntInfo[Index].LGKM_CNT = true; + if (!STI.hasFeature(AMDGPU::FeatureVscnt)) + InstrWaitCntInfo[Index].VM_CNT = true; + else if (!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) { + // else if (MCID.mayLoad() && !(MCID.TSFlags & + // SIInstrFlags::IsAtomicNoRet)) + // InstrWaitCntInfo[Index].VM_CNT = true; + InstrWaitCntInfo[Index].VM_CNT = true; + InstrWaitCntInfo[Index].VS_CNT = true; + } else + InstrWaitCntInfo[Index].VS_CNT = true; + } else if (isVMEM(MCID) && Opcode != AMDGPU::BUFFER_WBINVL1 && + Opcode != AMDGPU::BUFFER_WBINVL1_SC && + Opcode != AMDGPU::BUFFER_WBINVL1_VOL && + Opcode != AMDGPU::BUFFER_GL0_INV && + Opcode != AMDGPU::BUFFER_GL1_INV) { + if (!STI.hasFeature(AMDGPU::FeatureVscnt)) + InstrWaitCntInfo[Index].VM_CNT = true; + else if (!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet) || + (MCID.TSFlags & SIInstrFlags::MIMG)) { + // else if ((MCID.mayLoad() && !(MCID.TSFlags & + // SIInstrFlags::IsAtomicNoRet)) || + // (MCID.TSFlags & SIInstrFlags::MIMG && + // !MCID.mayLoad() && !MCID.mayStore())) + // InstrWaitCntInfo[Index].VM_CNT = true; + InstrWaitCntInfo[Index].VM_CNT = true; + InstrWaitCntInfo[Index].VS_CNT = true; + } else + // else if (MCID.mayStore()) + InstrWaitCntInfo[Index].VS_CNT = true; + + // Conservatively assume that + // GCNSubtarget::vmemWriteNeedsExpWaitcnt() would return true + + // if (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)) + InstrWaitCntInfo[Index].EXP_CNT = true; + } else if (MCID.TSFlags & SIInstrFlags::SMRD) { + InstrWaitCntInfo[Index].LGKM_CNT = true; + } else if (MCID.TSFlags & SIInstrFlags::EXP) { + InstrWaitCntInfo[Index].EXP_CNT = true; + } else { + switch (Opcode) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + case AMDGPU::S_MEMTIME: + case AMDGPU::S_MEMREALTIME: + InstrWaitCntInfo[Index].LGKM_CNT = true; + break; + } + } + } +} + +bool AMDGPUCustomBehaviour::manuallySetWaitCntInfo( + const std::unique_ptr &Inst, const int Index) { + switch (Inst->getOpcode()) { + // scroll wheel workout + case AMDGPU::DS_READ2ST64_B32_gfx10: + case AMDGPU::DS_READ2ST64_B32_gfx6_gfx7: + case AMDGPU::DS_READ2ST64_B32_vi: + case AMDGPU::DS_READ2ST64_B64_gfx10: + case AMDGPU::DS_READ2ST64_B64_gfx6_gfx7: + case AMDGPU::DS_READ2ST64_B64_vi: + case AMDGPU::DS_READ2_B32_gfx10: + case AMDGPU::DS_READ2_B32_gfx6_gfx7: + case AMDGPU::DS_READ2_B32_vi: + case AMDGPU::DS_READ2_B64_gfx10: + case AMDGPU::DS_READ2_B64_gfx6_gfx7: + case AMDGPU::DS_READ2_B64_vi: + case AMDGPU::DS_READ_ADDTID_B32_gfx10: + case AMDGPU::DS_READ_ADDTID_B32_vi: + case AMDGPU::DS_READ_B128_gfx10: + case AMDGPU::DS_READ_B128_gfx7: + case AMDGPU::DS_READ_B128_vi: + case AMDGPU::DS_READ_B32_gfx10: + case AMDGPU::DS_READ_B32_gfx6_gfx7: + case AMDGPU::DS_READ_B32_vi: + case AMDGPU::DS_READ_B64_gfx10: + case AMDGPU::DS_READ_B64_gfx6_gfx7: + case AMDGPU::DS_READ_B64_vi: + case AMDGPU::DS_READ_B96_gfx10: + case AMDGPU::DS_READ_B96_gfx7: + case AMDGPU::DS_READ_B96_vi: + case AMDGPU::DS_READ_I16_gfx10: + case AMDGPU::DS_READ_I16_gfx6_gfx7: + case AMDGPU::DS_READ_I16_vi: + case AMDGPU::DS_READ_I8_D16_HI_gfx10: + case AMDGPU::DS_READ_I8_D16_HI_vi: + case AMDGPU::DS_READ_I8_D16_gfx10: + case AMDGPU::DS_READ_I8_D16_vi: + case AMDGPU::DS_READ_I8_gfx10: + case AMDGPU::DS_READ_I8_gfx6_gfx7: + case AMDGPU::DS_READ_I8_vi: + case AMDGPU::DS_READ_U16_D16_HI_gfx10: + case AMDGPU::DS_READ_U16_D16_HI_vi: + case AMDGPU::DS_READ_U16_D16_gfx10: + case AMDGPU::DS_READ_U16_D16_vi: + case AMDGPU::DS_READ_U16_gfx10: + case AMDGPU::DS_READ_U16_gfx6_gfx7: + case AMDGPU::DS_READ_U16_vi: + 
+  case AMDGPU::DS_READ_U8_D16_HI_gfx10:
+  case AMDGPU::DS_READ_U8_D16_HI_vi:
+  case AMDGPU::DS_READ_U8_D16_gfx10:
+  case AMDGPU::DS_READ_U8_D16_vi:
+  case AMDGPU::DS_READ_U8_gfx10:
+  case AMDGPU::DS_READ_U8_gfx6_gfx7:
+  case AMDGPU::DS_READ_U8_vi:
+  case AMDGPU::DS_WRITE2ST64_B32_gfx10:
+  case AMDGPU::DS_WRITE2ST64_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2ST64_B32_vi:
+  case AMDGPU::DS_WRITE2ST64_B64_gfx10:
+  case AMDGPU::DS_WRITE2ST64_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2ST64_B64_vi:
+  case AMDGPU::DS_WRITE2_B32_gfx10:
+  case AMDGPU::DS_WRITE2_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2_B32_vi:
+  case AMDGPU::DS_WRITE2_B64_gfx10:
+  case AMDGPU::DS_WRITE2_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2_B64_vi:
+  case AMDGPU::DS_WRITE_ADDTID_B32_gfx10:
+  case AMDGPU::DS_WRITE_ADDTID_B32_vi:
+  case AMDGPU::DS_WRITE_B128_gfx10:
+  case AMDGPU::DS_WRITE_B128_gfx7:
+  case AMDGPU::DS_WRITE_B128_vi:
+  case AMDGPU::DS_WRITE_B16_D16_HI_gfx10:
+  case AMDGPU::DS_WRITE_B16_D16_HI_vi:
+  case AMDGPU::DS_WRITE_B16_gfx10:
+  case AMDGPU::DS_WRITE_B16_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B16_vi:
+  case AMDGPU::DS_WRITE_B32_gfx10:
+  case AMDGPU::DS_WRITE_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B32_vi:
+  case AMDGPU::DS_WRITE_B64_gfx10:
+  case AMDGPU::DS_WRITE_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B64_vi:
+  case AMDGPU::DS_WRITE_B8_D16_HI_gfx10:
+  case AMDGPU::DS_WRITE_B8_D16_HI_vi:
+  case AMDGPU::DS_WRITE_B8_gfx10:
+  case AMDGPU::DS_WRITE_B8_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B8_vi:
+  case AMDGPU::DS_WRITE_B96_gfx10:
+  case AMDGPU::DS_WRITE_B96_gfx7:
+  case AMDGPU::DS_WRITE_B96_vi:
+  case AMDGPU::DS_WRITE_SRC2_B32_gfx10:
+  case AMDGPU::DS_WRITE_SRC2_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_SRC2_B32_vi:
+  case AMDGPU::DS_WRITE_SRC2_B64_gfx10:
+  case AMDGPU::DS_WRITE_SRC2_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_SRC2_B64_vi:
+    // The ds_read and ds_write instructions
+    // are not automatically detected as interacting with
+    // lgkmcnt due to their flags not being ported from
+    // the pseudo instructions to the 'real' instructions.
+    InstrWaitCntInfo[Index].LGKM_CNT = true;
+    return true;
+  }
+
+  return false;
+}
+
+// Taken from SIInstrInfo::isVMEM().
+bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
+  return MCID.TSFlags & SIInstrFlags::MUBUF ||
+         MCID.TSFlags & SIInstrFlags::MTBUF ||
+         MCID.TSFlags & SIInstrFlags::MIMG;
+}
+
+// Taken from SIInstrInfo::hasModifiersSet().
+bool AMDGPUCustomBehaviour::hasModifiersSet(
+    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
+  if (Idx == -1)
+    return false;
+
+  const MCAOperand &Op = Inst->getOperand(Idx);
+  if (!Op.isImm() || !Op.getImm())
+    return false;
+
+  return true;
+}
+
+// Taken from SIInstrInfo::isAlwaysGDS().
+bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
+  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
+         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+         Opcode == AMDGPU::DS_GWS_SEMA_P ||
+         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+         Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/CMakeLists.txt b/llvm/tools/llvm-mca/lib/AMDGPU/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/AMDGPU/CMakeLists.txt
@@ -0,0 +1,17 @@
+include_directories(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/AMDGPU
+  ${LLVM_BINARY_DIR}/lib/Target/AMDGPU
+  )
+
+set(LLVM_LINK_COMPONENTS
+  AMDGPU
+  Core
+  Support
+  )
+
+add_llvm_library(LLVMMCACustomBehaviourAMDGPU
+  AMDGPUCustomBehaviour.cpp
+
+  DEPENDS
+  AMDGPUCommonTableGen
+  )
diff --git a/llvm/tools/llvm-mca/lib/CMakeLists.txt b/llvm/tools/llvm-mca/lib/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(TARGETS_TO_APPEND "")
+
+if (LLVM_TARGETS_TO_BUILD MATCHES "X86")
+  add_subdirectory(X86)
+  list(APPEND TARGETS_TO_APPEND LLVMMCACustomBehaviourX86)
+endif()
+if (LLVM_TARGETS_TO_BUILD MATCHES "AMDGPU")
+  add_subdirectory(AMDGPU)
+  list(APPEND TARGETS_TO_APPEND LLVMMCACustomBehaviourAMDGPU)
+endif()
+
+set(LLVM_MCA_CUSTOMBEHAVIOUR_TARGETS ${TARGETS_TO_APPEND} PARENT_SCOPE)
diff --git a/llvm/tools/llvm-mca/lib/X86/CMakeLists.txt b/llvm/tools/llvm-mca/lib/X86/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/X86/CMakeLists.txt
@@ -0,0 +1,17 @@
+include_directories(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/X86
+  ${LLVM_BINARY_DIR}/lib/Target/X86
+  )
+
+set(LLVM_LINK_COMPONENTS
+  X86
+  Core
+  Support
+  )
+
+add_llvm_library(LLVMMCACustomBehaviourX86
+  X86CustomBehaviour.cpp
+
+  DEPENDS
+  X86CommonTableGen
+  )
diff --git a/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.h b/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.h
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.h
@@ -0,0 +1,41 @@
+//===-------------------- X86CustomBehaviour.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the X86CustomBehaviour class which inherits from
+/// CustomBehaviour.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_X86CUSTOMBEHAVIOUR_H
+#define LLVM_TOOLS_LLVM_MCA_X86CUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+
+namespace llvm {
+namespace mca {
+
+class X86CustomBehaviour : public CustomBehaviour {
+public:
+  X86CustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+                     const MCInstrInfo &MCII)
+      : CustomBehaviour(STI, SrcMgr, MCII) {}
+
+  ~X86CustomBehaviour() {}
+
+  // The purpose of this method can be seen in
+  // llvm/include/llvm/MCA/CustomBehaviour.h.
+  unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                             const InstRef &IR) override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif /* LLVM_TOOLS_LLVM_MCA_X86CUSTOMBEHAVIOUR_H */
diff --git a/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.cpp b/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/X86/X86CustomBehaviour.cpp
@@ -0,0 +1,26 @@
+//===------------------- X86CustomBehaviour.cpp -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the X86CustomBehaviour class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CustomBehaviour.h"
+
+namespace llvm {
+namespace mca {
+
+unsigned
+X86CustomBehaviour::checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                                      const InstRef &IR) {
+  return 0;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -32,6 +32,8 @@
 #include "Views/SchedulerStatistics.h"
 #include "Views/SummaryView.h"
 #include "Views/TimelineView.h"
+#include "lib/AMDGPU/AMDGPUCustomBehaviour.h"
+#include "lib/X86/X86CustomBehaviour.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -42,6 +44,7 @@
 #include "llvm/MC/MCTargetOptionsCommandFlags.h"
 #include "llvm/MCA/CodeEmitter.h"
 #include "llvm/MCA/Context.h"
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/InstrBuilder.h"
 #include "llvm/MCA/Pipeline.h"
 #include "llvm/MCA/Stages/EntryStage.h"
@@ -220,6 +223,12 @@
     cl::desc("Print encoding information in the instruction info view"),
     cl::cat(ViewOptions), cl::init(false));
 
+static cl::opt<bool> DisableCustomBehaviour(
+    "disable-cb",
+    cl::desc(
+        "Disable custom behaviour (use the default class which does nothing)."),
+    cl::cat(ViewOptions), cl::init(false));
+
 namespace {
 
 const Target *getTarget(const char *ProgName) {
@@ -285,6 +294,28 @@
   processOptionImpl(PrintRetireStats, Default);
 }
 
+std::unique_ptr<mca::CustomBehaviour>
+createCustomBehaviour(Triple &TheTriple, const MCSubtargetInfo &STI,
+                      const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII) {
+  // Build the appropriate CustomBehaviour object for the current target.
+  // The CustomBehaviour class should never depend on the source code,
+  // but it can depend on the list of mca::Instruction and any classes
+  // that can be built using just the target info. If you need extra
+  // information from the source code or the list of MCInst, consider
+  // adding that information to the mca::Instruction class and setting
+  // it during InstrBuilder::createInstruction().
+  if (DisableCustomBehaviour)
+    return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
+
+  if (TheTriple.isX86())
+    return std::make_unique<mca::X86CustomBehaviour>(STI, SrcMgr, MCII);
+
+  if (TheTriple.isAMDGPU())
+    return std::make_unique<mca::AMDGPUCustomBehaviour>(STI, SrcMgr, MCII);
+
+  return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
+}
+
 // Returns true on success.
 static bool runPipeline(mca::Pipeline &P) {
   // Handle pipeline errors here.
@@ -547,8 +578,17 @@
       continue;
     }
 
+    // Create the CustomBehaviour object for enforcing target specific
+    // behaviours and dependencies that aren't expressed well enough
+    // in the TableGen scheduling model. CB cannot depend on the list of
+    // MCInst or the source code (but it can depend on the list of
+    // mca::Instruction or any objects that can be reconstructed
+    // from the target information).
+    std::unique_ptr<mca::CustomBehaviour> CB =
+        createCustomBehaviour(TheTriple, *STI, S, *MCII);
+
     // Create a basic pipeline simulating an out-of-order backend.
-    auto P = MCA.createDefaultPipeline(PO, S);
+    auto P = MCA.createDefaultPipeline(PO, S, *CB);
     mca::PipelinePrinter Printer(*P, PrintJson ? mca::View::OK_JSON
                                                : mca::View::OK_READABLE);
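
For readers adding their own target, the sketch below shows roughly what a new
CustomBehaviour subclass looks like. It is illustrative only and not part of
this patch: the "Foo" target name, the FooSyncOpcode constant, and the stall
policy are invented (a real target would test opcodes from its TableGen'erated
enum), while the hooks it uses (checkCustomHazard(), getOpcode(), and
getCyclesLeft()) are the ones introduced above.

  // FooCustomBehaviour.h - illustrative sketch only, not part of this patch.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/MCA/CustomBehaviour.h"

  namespace llvm {
  namespace mca {

  class FooCustomBehaviour : public CustomBehaviour {
    // Stand-in opcode for a hypothetical full-barrier instruction. A real
    // target would use its generated opcode enum (e.g. Foo::S_SYNC).
    static const unsigned FooSyncOpcode = 1234;

  public:
    FooCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
                       const MCInstrInfo &MCII)
        : CustomBehaviour(STI, SrcMgr, MCII) {}

    unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
                               const InstRef &IR) override {
      // Only the hypothetical barrier stalls; everything else dispatches
      // normally.
      if (IR.getInstruction()->getOpcode() != FooSyncOpcode)
        return 0;

      // Stall until the in-flight instruction that finishes soonest has
      // written back. This underestimates the total wait, which is safe:
      // the pipeline invokes this hook again before dispatching.
      unsigned StallCycles = 0;
      for (const InstRef &Older : IssuedInst) {
        int CyclesLeft = Older.getInstruction()->getCyclesLeft();
        if (CyclesLeft > 0 &&
            (StallCycles == 0 || unsigned(CyclesLeft) < StallCycles))
          StallCycles = unsigned(CyclesLeft);
      }
      return StallCycles;
    }
  };

  } // namespace mca
  } // namespace llvm

To wire such a class in, return it from createCustomBehaviour() in
llvm-mca.cpp for the new triple and add a library directory under
llvm/tools/llvm-mca/lib/, mirroring the X86 and AMDGPU CMakeLists.txt files
shown above.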