Index: llvm/docs/CommandGuide/llvm-mca.rst
===================================================================
--- llvm/docs/CommandGuide/llvm-mca.rst
+++ llvm/docs/CommandGuide/llvm-mca.rst
@@ -212,6 +212,11 @@
 Print the requested views in JSON format. The instructions and the processor
 resources are printed as members of special top level JSON objects. The
 individual views refer to them by index.
+
+.. option:: -disable-cb
+
+  Force usage of the generic CustomBehaviour class rather than using the target
+  specific class. The generic class never detects any custom hazards.
 
 EXIT STATUS
@@ -978,3 +983,32 @@
 retire. :program:`llvm-mca` ensures that writes are committed in-order. However,
 an instruction is allowed to commit writes and retire out-of-order if
 ``RetireOOO`` property is true for at least one of its writes.
+
+Custom Behaviour
+""""""""""""""""
+Due to certain instructions not being expressed perfectly within their
+scheduling model, :program:`llvm-mca` isn't always able to simulate them
+perfectly. Modifying the scheduling model isn't always a viable
+option though (maybe because the instruction is modeled incorrectly on
+purpose or the instruction's behaviour is quite complex). The
+CustomBehaviour class can be used in these cases to enforce proper
+instruction modeling (often by customizing data dependencies and detecting
+hazards that :program:`llvm-mca` has no way of knowing about).
+
+:program:`llvm-mca` comes with one generic and multiple target specific
+CustomBehaviour classes. The generic class will be used if the ``-disable-cb``
+flag is used or if a target specific CustomBehaviour class doesn't exist for
+that target. (The generic class does nothing.) Currently, the CustomBehaviour
+class is only a part of the in-order pipeline, but there are plans to add it
+to the out-of-order pipeline in the future.
+
+CustomBehaviour's main method is `checkCustomHazard()`, which uses the
+current instruction and a list of all instructions still executing within
+the pipeline to determine if the current instruction should be dispatched.
+As output, the method returns an integer representing the number of cycles
+that the current instruction must stall for (this can be an underestimate
+if you don't know the exact number, and a value of 0 represents no stall).
+
+If you'd like to add a CustomBehaviour class for a target that doesn't
+already have one, refer to an existing implementation to see how to set it
+up. Remember to look at (and add to) `/llvm-mca/lib/CMakeLists.txt`.
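To make the documented interface concrete, here is a minimal sketch of what a new target's CustomBehaviour subclass could look like, written against the API introduced by this patch. The `FooCustomBehaviour` class, the `FooSyncOpcode` constant, and the drain-the-pipeline rule are purely illustrative assumptions, not part of the patch; only the base-class constructor and the `checkCustomHazard()` signature are taken from CustomBehaviour.h below.

#include "llvm/ADT/SmallVector.h"
#include "llvm/MCA/CustomBehaviour.h"

namespace llvm {
namespace mca {

// Hypothetical opcode for this sketch only; a real target would compare
// against its generated opcode enum (e.g. AMDGPU::S_WAITCNT) instead.
constexpr unsigned FooSyncOpcode = 42;

class FooCustomBehaviour : public CustomBehaviour {
public:
  FooCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
                     const MCInstrInfo &MCII)
      : CustomBehaviour(STI, SrcMgr, MCII) {}

  unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
                             const InstRef &IR) override {
    // Invented rule: a "sync" instruction may not dispatch until every
    // older instruction has finished executing.
    if (IR.getInstruction()->getOpcode() != FooSyncOpcode)
      return 0; // No custom hazard for any other opcode.
    unsigned Stall = 0;
    for (const InstRef &PrevIR : IssuedInst) {
      int CyclesLeft = PrevIR.getInstruction()->getCyclesLeft();
      // Returning the smallest known CyclesLeft is a safe underestimate:
      // checkCustomHazard() runs again before the next dispatch attempt.
      if (CyclesLeft > 0 && (!Stall || (unsigned)CyclesLeft < Stall))
        Stall = CyclesLeft;
    }
    return Stall; // 0 means no stall.
  }
};

} // namespace mca
} // namespace llvm

A real implementation would also be registered in createCustomBehaviour() in llvm-mca.cpp and linked via llvm-mca/lib/CMakeLists.txt, as the AMDGPU changes later in this patch demonstrate.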
Index: llvm/include/llvm/MCA/Context.h
===================================================================
--- llvm/include/llvm/MCA/Context.h
+++ llvm/include/llvm/MCA/Context.h
@@ -19,6 +19,7 @@
 
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
 #include "llvm/MCA/Pipeline.h"
 #include "llvm/MCA/SourceMgr.h"
@@ -67,12 +68,14 @@
   /// Construct a basic pipeline for simulating an out-of-order pipeline.
   /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages.
   std::unique_ptr<Pipeline> createDefaultPipeline(const PipelineOptions &Opts,
-                                                  SourceMgr &SrcMgr);
+                                                  SourceMgr &SrcMgr,
+                                                  CustomBehaviour &CB);
 
   /// Construct a basic pipeline for simulating an in-order pipeline.
   /// This pipeline consists of Fetch, InOrderIssue, and Retire stages.
   std::unique_ptr<Pipeline> createInOrderPipeline(const PipelineOptions &Opts,
-                                                  SourceMgr &SrcMgr);
+                                                  SourceMgr &SrcMgr,
+                                                  CustomBehaviour &CB);
 };
 } // namespace mca
Index: llvm/include/llvm/MCA/CustomBehaviour.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/MCA/CustomBehaviour.h
@@ -0,0 +1,86 @@
+//===---------------------- CustomBehaviour.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the base class CustomBehaviour which can be inherited from
+/// by specific targets (ex. llvm/tools/llvm-mca/lib/X86CustomBehaviour.h).
+/// CustomBehaviour is designed to enforce custom behaviour and dependencies
+/// within the llvm-mca pipeline simulation that llvm-mca isn't already capable
+/// of extracting from the Scheduling Models.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_CUSTOMBEHAVIOUR_H
+#define LLVM_MCA_CUSTOMBEHAVIOUR_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MCA/SourceMgr.h"
+
+namespace llvm {
+namespace mca {
+
+/// Class which can be overridden by targets to modify the
+/// mca::Instruction objects before the pipeline starts.
+/// A common usage of this class is to add immediate operands to certain
+/// instructions or to remove Defs/Uses from an instruction where the
+/// scheduling model is incorrect.
+class InstrPostProcess {
+protected:
+  const MCSubtargetInfo &STI;
+  const MCInstrInfo &MCII;
+
+public:
+  InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
+      : STI(STI), MCII(MCII) {}
+
+  virtual ~InstrPostProcess() {}
+
+  virtual void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+                                      const MCInst &MCI) {}
+};
+
+/// Class which can be overridden by targets to enforce instruction
+/// dependencies and behaviours that aren't expressed well enough
+/// within the scheduling model for mca to automatically simulate
+/// them properly.
+/// If you implement this class for your target, make sure to also implement
+/// a target specific InstrPostProcess class as well.
+class CustomBehaviour {
+protected:
+  const MCSubtargetInfo &STI;
+  const SourceMgr &SrcMgr;
+  const MCInstrInfo &MCII;
+
+public:
+  CustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+                  const MCInstrInfo &MCII)
+      : STI(STI), SrcMgr(SrcMgr), MCII(MCII) {}
+
+  virtual ~CustomBehaviour() {}
+
+  // Before the llvm-mca pipeline dispatches an instruction, it first checks
+  // for any register or resource dependencies / hazards. If it doesn't find
+  // any, this method will be invoked to determine if there are any custom
+  // hazards that the instruction needs to wait for.
+  // The return value of this method is the number of cycles that the
+  // instruction needs to wait for.
+  // It's safe to underestimate the number of cycles to wait for since these
+  // checks will be invoked again before the instruction gets dispatched.
+  // However, it's not safe (accurate) to overestimate the number of cycles
+  // to wait for since the instruction will wait for AT LEAST that number of
+  // cycles before attempting to be dispatched again.
+  virtual unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                                     const InstRef &IR);
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif /* LLVM_MCA_CUSTOMBEHAVIOUR_H */
Index: llvm/include/llvm/MCA/HWEventListener.h
===================================================================
--- llvm/include/llvm/MCA/HWEventListener.h
+++ llvm/include/llvm/MCA/HWEventListener.h
@@ -115,6 +115,7 @@
     SchedulerQueueFull,
     LoadQueueFull,
     StoreQueueFull,
+    CustomBehaviourStall,
     LastGenericEvent
   };
Index: llvm/include/llvm/MCA/Instruction.h
===================================================================
--- llvm/include/llvm/MCA/Instruction.h
+++ llvm/include/llvm/MCA/Instruction.h
@@ -33,6 +33,104 @@
 
 constexpr int UNKNOWN_CYCLES = -512;
 
+/// A representation of an mca::Instruction operand
+/// for use in mca::CustomBehaviour.
+class MCAOperand {
+  // This class is mostly copied from MCOperand within
+  // MCInst.h except that we don't keep track of
+  // expressions or sub-instructions.
+  enum MCAOperandType : unsigned char {
+    kInvalid,      ///< Uninitialized, Relocatable immediate, or Sub-instruction.
+    kRegister,     ///< Register operand.
+    kImmediate,    ///< Immediate operand.
+    kSFPImmediate, ///< Single-floating-point immediate operand.
+    kDFPImmediate, ///< Double-Floating-point immediate operand.
+  };
+  MCAOperandType Kind = kInvalid;
+
+  union {
+    unsigned RegVal;
+    int64_t ImmVal;
+    uint32_t SFPImmVal;
+    uint64_t FPImmVal;
+  };
+
+  // We only store specific operands for specific instructions
+  // so an instruction's operand 3 may be stored within the list
+  // of MCAOperand as element 0. This Index attribute keeps track
+  // of the original index (3 for this example).
+  unsigned Index;
+
+public:
+  MCAOperand() : FPImmVal(0) {}
+
+  bool isValid() const { return Kind != kInvalid; }
+  bool isReg() const { return Kind == kRegister; }
+  bool isImm() const { return Kind == kImmediate; }
+  bool isSFPImm() const { return Kind == kSFPImmediate; }
+  bool isDFPImm() const { return Kind == kDFPImmediate; }
+
+  /// Returns the register number.
+  unsigned getReg() const {
+    assert(isReg() && "This is not a register operand!");
+    return RegVal;
+  }
+
+  int64_t getImm() const {
+    assert(isImm() && "This is not an immediate");
+    return ImmVal;
+  }
+
+  uint32_t getSFPImm() const {
+    assert(isSFPImm() && "This is not an SFP immediate");
+    return SFPImmVal;
+  }
+
+  uint64_t getDFPImm() const {
+    assert(isDFPImm() && "This is not an FP immediate");
+    return FPImmVal;
+  }
+
+  void setIndex(const unsigned Idx) { Index = Idx; }
+
+  unsigned getIndex() const { return Index; }
+
+  static MCAOperand createReg(unsigned Reg) {
+    MCAOperand Op;
+    Op.Kind = kRegister;
+    Op.RegVal = Reg;
+    return Op;
+  }
+
+  static MCAOperand createImm(int64_t Val) {
+    MCAOperand Op;
+    Op.Kind = kImmediate;
+    Op.ImmVal = Val;
+    return Op;
+  }
+
+  static MCAOperand createSFPImm(uint32_t Val) {
+    MCAOperand Op;
+    Op.Kind = kSFPImmediate;
+    Op.SFPImmVal = Val;
+    return Op;
+  }
+
+  static MCAOperand createDFPImm(uint64_t Val) {
+    MCAOperand Op;
+    Op.Kind = kDFPImmediate;
+    Op.FPImmVal = Val;
+    return Op;
+  }
+
+  static MCAOperand createInvalid() {
+    MCAOperand Op;
+    Op.Kind = kInvalid;
+    Op.FPImmVal = 0;
+    return Op;
+  }
+};
+
 /// A register write descriptor.
 struct WriteDescriptor {
   // Operand index. The index is negative for implicit writes only.
@@ -160,6 +258,7 @@
   int getCyclesLeft() const { return CyclesLeft; }
   unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; }
   MCPhysReg getRegisterID() const { return RegisterID; }
+  void setRegisterID(const MCPhysReg RegID) { RegisterID = RegID; }
   unsigned getRegisterFileID() const { return PRFID; }
   unsigned getLatency() const { return WD->Latency; }
   unsigned getDependentWriteCyclesLeft() const {
@@ -409,8 +508,15 @@
   // One entry per each implicit and explicit register use.
   SmallVector<ReadState, 4> Uses;
 
+  // List of operands which can be used by mca::CustomBehaviour
+  std::vector<MCAOperand> Operands;
+
+  // Instruction opcode which can be used by mca::CustomBehaviour
+  unsigned Opcode;
+
 public:
-  InstructionBase(const InstrDesc &D) : Desc(D), IsOptimizableMove(false) {}
+  InstructionBase(const InstrDesc &D, const unsigned Opcode)
+      : Desc(D), IsOptimizableMove(false), Operands(0), Opcode(Opcode) {}
 
   SmallVectorImpl<WriteState> &getDefs() { return Defs; }
   ArrayRef<WriteState> getDefs() const { return Defs; }
@@ -420,6 +526,20 @@
   unsigned getLatency() const { return Desc.MaxLatency; }
   unsigned getNumMicroOps() const { return Desc.NumMicroOps; }
+  unsigned getOpcode() const { return Opcode; }
+
+  /// Return the MCAOperand which corresponds to index Idx within the original
+  /// MCInst.
+  const MCAOperand *getOperand(const unsigned Idx) const {
+    auto It = std::find_if(
+        Operands.begin(), Operands.end(),
+        [&Idx](const MCAOperand &Op) { return Op.getIndex() == Idx; });
+    if (It == Operands.end())
+      return nullptr;
+    return &(*It);
+  }
+  unsigned getNumOperands() const { return Operands.size(); }
+  void addOperand(const MCAOperand Op) { Operands.push_back(Op); }
 
   bool hasDependentUsers() const {
     return any_of(Defs,
@@ -490,11 +610,11 @@
   bool IsEliminated;
 
 public:
-  Instruction(const InstrDesc &D)
-      : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
-        RCUTokenID(0), LSUTokenID(0), UsedBuffers(D.UsedBuffers),
-        CriticalRegDep(), CriticalMemDep(), CriticalResourceMask(0),
-        IsEliminated(false) {}
+  Instruction(const InstrDesc &D, const unsigned Opcode)
+      : InstructionBase(D, Opcode), Stage(IS_INVALID),
+        CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0), LSUTokenID(0),
+        UsedBuffers(D.UsedBuffers), CriticalRegDep(), CriticalMemDep(),
+        CriticalResourceMask(0), IsEliminated(false) {}
 
   unsigned getRCUTokenID() const { return RCUTokenID; }
   unsigned getLSUTokenID() const { return LSUTokenID; }
Index: llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
===================================================================
--- llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_MCA_STAGES_INORDERISSUESTAGE_H
 #define LLVM_MCA_STAGES_INORDERISSUESTAGE_H
 
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/HardwareUnits/ResourceManager.h"
 #include "llvm/MCA/SourceMgr.h"
 #include "llvm/MCA/Stages/Stage.h"
@@ -23,7 +24,13 @@
 class RegisterFile;
 
 struct StallInfo {
-  enum class StallKind { DEFAULT, REGISTER_DEPS, DISPATCH, DELAY };
+  enum class StallKind {
+    DEFAULT,
+    REGISTER_DEPS,
+    DISPATCH,
+    DELAY,
+    CUSTOM_STALL
+  };
 
   InstRef IR;
   unsigned CyclesLeft;
@@ -46,6 +53,7 @@
   const MCSubtargetInfo &STI;
   RegisterFile &PRF;
   ResourceManager RM;
+  CustomBehaviour &CB;
 
   /// Instructions that were issued, but not executed yet.
   SmallVector<InstRef, 4> IssuedInst;
@@ -101,7 +109,8 @@
   void retireInstruction(InstRef &IR);
 
 public:
-  InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF);
+  InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF,
+                    CustomBehaviour &CB);
 
   unsigned getIssueWidth() const;
   bool isAvailable(const InstRef &) const override;
Index: llvm/lib/MCA/CMakeLists.txt
===================================================================
--- llvm/lib/MCA/CMakeLists.txt
+++ llvm/lib/MCA/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMMCA
   CodeEmitter.cpp
   Context.cpp
+  CustomBehaviour.cpp
   HWEventListener.cpp
   HardwareUnits/HardwareUnit.cpp
   HardwareUnits/LSUnit.cpp
Index: llvm/lib/MCA/Context.cpp
===================================================================
--- llvm/lib/MCA/Context.cpp
+++ llvm/lib/MCA/Context.cpp
@@ -29,11 +29,12 @@
 namespace mca {
 
 std::unique_ptr<Pipeline>
-Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
+Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr,
+                               CustomBehaviour &CB) {
   const MCSchedModel &SM = STI.getSchedModel();
 
   if (!SM.isOutOfOrder())
-    return createInOrderPipeline(Opts, SrcMgr);
+    return createInOrderPipeline(Opts, SrcMgr, CB);
 
   // Create the hardware units defining the backend.
   auto RCU = std::make_unique<RetireControlUnit>(SM);
@@ -69,13 +70,14 @@
 }
 
 std::unique_ptr<Pipeline>
-Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
+Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr,
+                               CustomBehaviour &CB) {
   const MCSchedModel &SM = STI.getSchedModel();
   auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
 
   // Create the pipeline stages.
   auto Entry = std::make_unique<EntryStage>(SrcMgr);
-  auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF);
+  auto InOrderIssue = std::make_unique<InOrderIssueStage>(STI, *PRF, CB);
   auto StagePipeline = std::make_unique<Pipeline>();
 
   // Pass the ownership of all the hardware units to this Context.
Index: llvm/lib/MCA/CustomBehaviour.cpp
===================================================================
--- /dev/null
+++ llvm/lib/MCA/CustomBehaviour.cpp
@@ -0,0 +1,27 @@
+//===--------------------- CustomBehaviour.cpp ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the CustomBehaviour interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/CustomBehaviour.h"
+
+namespace llvm {
+namespace mca {
+
+unsigned
+CustomBehaviour::checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                                   const InstRef &IR) {
+  // 0 signifies that there are no hazards that need to be waited on.
+  return 0;
+}
+
+} // namespace mca
+} // namespace llvm
Index: llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
===================================================================
--- llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
+++ llvm/lib/MCA/HardwareUnits/RegisterFile.cpp
@@ -109,7 +109,10 @@
     return;
 
   MCPhysReg RegID = WS.getRegisterID();
-  assert(RegID != 0 && "A write of an invalid register?");
+  // If RegID is 0, it means we removed this Def in post processing.
+  if (!RegID)
+    continue;
+
   assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
          "The number of cycles should be known at this point!");
   assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
@@ -224,7 +227,10 @@
                                     MutableArrayRef<unsigned> UsedPhysRegs) {
   WriteState &WS = *Write.getWriteState();
   MCPhysReg RegID = WS.getRegisterID();
-  assert(RegID && "Adding an invalid register definition?");
+
+  // If RegID is 0, it means we removed this Def in post processing.
+  if (!RegID)
+    return;
 
   LLVM_DEBUG({
     dbgs() << "[PRF] addRegisterWrite [ " << Write.getSourceIndex() << ", "
@@ -316,7 +322,10 @@
 
   MCPhysReg RegID = WS.getRegisterID();
 
-  assert(RegID != 0 && "Invalidating an already invalid register?");
+  // If RegID is 0, it means we removed this Def in post processing.
+  if (!RegID)
+    return;
+
   assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
          "Invalidating a write of unknown cycles!");
   assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
Index: llvm/lib/MCA/InstrBuilder.cpp
===================================================================
--- llvm/lib/MCA/InstrBuilder.cpp
+++ llvm/lib/MCA/InstrBuilder.cpp
@@ -616,7 +616,8 @@
   if (!DescOrErr)
     return DescOrErr.takeError();
   const InstrDesc &D = *DescOrErr;
-  std::unique_ptr<Instruction> NewIS = std::make_unique<Instruction>(D);
+  std::unique_ptr<Instruction> NewIS =
+      std::make_unique<Instruction>(D, MCI.getOpcode());
 
   // Check if this is a dependency breaking instruction.
   APInt Mask;
Index: llvm/lib/MCA/Stages/InOrderIssueStage.cpp
===================================================================
--- llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -43,8 +43,8 @@
 }
 
 InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI,
-                                     RegisterFile &PRF)
-    : STI(STI), PRF(PRF), RM(STI.getSchedModel()), NumIssued(), SI(),
+                                     RegisterFile &PRF, CustomBehaviour &CB)
+    : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), NumIssued(), SI(),
       CarryOver(), Bandwidth(), LastWriteBackCycle() {}
 
 unsigned InOrderIssueStage::getIssueWidth() const {
@@ -125,6 +125,11 @@
     return false;
   }
 
+  if (unsigned CustomStallCycles = CB.checkCustomHazard(IssuedInst, IR)) {
+    SI.update(IR, CustomStallCycles, StallInfo::StallKind::CUSTOM_STALL);
+    return false;
+  }
+
   if (LastWriteBackCycle) {
     if (!IR.getInstruction()->getDesc().RetireOOO) {
       unsigned NextWriteBackCycle = findFirstWriteBackCycle(IR);
@@ -333,6 +338,11 @@
         HWPressureEvent(HWPressureEvent::RESOURCES, IR));
     break;
   }
+  case StallInfo::StallKind::CUSTOM_STALL: {
+    notifyEvent<HWStallEvent>(
+        HWStallEvent(HWStallEvent::CustomBehaviourStall, IR));
+    break;
+  }
   }
 }
Index: llvm/lib/Target/AMDGPU/SISchedule.td
===================================================================
--- llvm/lib/Target/AMDGPU/SISchedule.td
+++ llvm/lib/Target/AMDGPU/SISchedule.td
@@ -20,6 +20,7 @@
 def WriteExport : SchedWrite;
 def WriteLDS : SchedWrite;
 def WriteSALU : SchedWrite;
+def WriteCBSALU : SchedWrite; // RetireOOO flag for llvm-mca (s_waitcnt instrs)
 def WriteSMEM : SchedWrite;
 def WriteVMEM : SchedWrite;
 def WriteBarrier : SchedWrite;
@@ -256,10 +257,13 @@
 def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
+let RetireOOO = 1 in
+def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
 def : HWWriteRes;
 
 def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[WriteCBSALU], (instregex "S_WAITCNT.*")>;
 
 } // End SchedModel = GFX10SpeedModel
Index: llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-custom-behaviour.s
===================================================================
--- /dev/null
+++ llvm/test/tools/llvm-mca/AMDGPU/gfx10-waitcnt-custom-behaviour.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx1010 -dispatch-stats=false -scheduler-stats=false < %s | FileCheck %s
+
+s_load_dwordx4 s[0:3], s[4:5], 0x0
+v_mov_b32_e32 v0, 0
+s_waitcnt vmcnt(0) lgkmcnt(0)
+s_waitcnt_vscnt null, 0x0
+global_load_dword v1, v0, s[0:1] glc dlc
+s_waitcnt vmcnt(0)
+
+# CHECK: Total Cycles: 34302
Index: llvm/tools/llvm-mca/CMakeLists.txt
===================================================================
--- llvm/tools/llvm-mca/CMakeLists.txt
+++ llvm/tools/llvm-mca/CMakeLists.txt
@@ -1,5 +1,7 @@
 include_directories(include)
 
+add_subdirectory(lib)
+
 set(LLVM_LINK_COMPONENTS
   AllTargetsAsmParsers
   AllTargetsDescs
@@ -30,3 +32,7 @@
   )
 
 set(LLVM_MCA_SOURCE_DIR ${CURRENT_SOURCE_DIR})
+
+target_link_libraries(llvm-mca PRIVATE
+  ${LLVM_MCA_CUSTOMBEHAVIOUR_TARGETS}
+  )
Index: llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
===================================================================
--- llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
+++ llvm/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -77,6 +77,8 @@
   printStalls(SS, HWStalls[HWStallEvent::StoreQueueFull], NumCycles);
   SS << "\nGROUP - Static restrictions on the dispatch group: ";
   printStalls(SS, HWStalls[HWStallEvent::DispatchGroupStall], NumCycles);
+  SS << "\nUSH - Uncategorised Structural Hazard: ";
+  printStalls(SS, HWStalls[HWStallEvent::CustomBehaviourStall], NumCycles);
   SS << '\n';
   SS.flush();
   OS << Buffer;
Index: llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
===================================================================
--- /dev/null
+++ llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h
@@ -0,0 +1,110 @@
+//===------------------- AMDGPUCustomBehaviour.h ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the AMDGPUCustomBehaviour class which inherits from
+/// CustomBehaviour.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H
+#define LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+#include "llvm/Support/TargetParser.h"
+
+namespace llvm {
+namespace mca {
+
+class AMDGPUInstrPostProcess : public InstrPostProcess {
+  void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
+public:
+  AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
+      : InstrPostProcess(STI, MCII) {}
+
+  ~AMDGPUInstrPostProcess() {}
+
+  void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+                              const MCInst &MCI) override;
+};
+
+struct WaitCntInfo {
+  bool VmCnt = false;
+  bool ExpCnt = false;
+  bool LgkmCnt = false;
+  bool VsCnt = false;
+};
+
+class AMDGPUCustomBehaviour : public CustomBehaviour {
+  /// Whenever MCA would like to dispatch an s_waitcnt instruction,
+  /// we must check all the instructions that are still executing to see if
+  /// they modify the same CNT as we need to wait for. This vector
+  /// gets built in the constructor and contains 1 WaitCntInfo struct
+  /// for each instruction within the SrcManager. Each element
+  /// tells us which CNTs that instruction may interact with.
+  /// We conservatively assume some instructions interact with more
+  /// CNTs than they do in reality, so we will occasionally wait
+  /// longer than necessary, but we shouldn't ever wait for shorter.
+  std::vector<WaitCntInfo> InstrWaitCntInfo;
+
+  /// This method gets called from the constructor and is
+  /// where we setup the InstrWaitCntInfo vector.
+  /// The core logic for determining which CNTs an instruction
+  /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter().
+  /// Unfortunately, some of the logic from that function is not available to
+  /// us in this scope so we conservatively end up assuming that some
+  /// instructions interact with more CNTs than they do in reality.
+  void generateWaitCntInfo();
+  /// Helper function used in generateWaitCntInfo()
+  bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
+                       unsigned OpName) const;
+  /// Helper function used in generateWaitCntInfo()
+  bool isAlwaysGDS(uint16_t Opcode) const;
+  /// Helper function used in generateWaitCntInfo()
+  bool isVMEM(const MCInstrDesc &MCID);
+  /// This method gets called from checkCustomHazard when mca is attempting to
+  /// dispatch an s_waitcnt instruction (or one of its variants). The method
+  /// looks at each of the instructions that are still executing in the
+  /// pipeline to determine if the waitcnt should force a wait.
+  unsigned handleWaitCnt(const SmallVector<InstRef> &IssuedInst,
+                         const InstRef &IR);
+  /// Based on the type of s_waitcnt instruction we are looking at, and what
+  /// its operands are, this method will set the values for each of the cnt
+  /// references provided as arguments.
+  void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt,
+                      unsigned &Lgkmcnt, unsigned &Vscnt);
+
+  /// Some instructions aren't automatically detected as interacting with any
+  /// of the CNTs even though they should. This method can be used to manually
+  /// set the WaitCntInfo for those instructions.
+  bool manuallySetWaitCntInfo(const std::unique_ptr<Instruction> &Inst,
+                              const int Index);
+
+public:
+  AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr,
+                        const MCInstrInfo &MCII);
+
+  ~AMDGPUCustomBehaviour() {}
+
+  /// This method is used to determine if an instruction
+  /// should be allowed to be dispatched. The return value is
+  /// how many cycles until the instruction can be dispatched.
+  /// This method is called after MCA has already checked for
+  /// register and hardware dependencies so this method should only
+  /// implement custom behaviour and dependencies that are not picked up
+  /// by MCA naturally.
+  unsigned checkCustomHazard(const SmallVector<InstRef> &IssuedInst,
+                             const InstRef &IR) override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif /* LLVM_TOOLS_LLVM_MCA_LIB_AMDGPU_AMDGPUCUSTOMBEHAVIOUR_H */
Index: llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
===================================================================
--- /dev/null
+++ llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp
@@ -0,0 +1,458 @@
+//===------------------ AMDGPUCustomBehaviour.cpp --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the AMDGPUCustomBehaviour class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCustomBehaviour.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/Support/WithColor.h"
+
+namespace llvm {
+namespace mca {
+
+void AMDGPUInstrPostProcess::postProcessInstruction(
+    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
+  switch (MCI.getOpcode()) {
+  case AMDGPU::S_WAITCNT:
+  case AMDGPU::S_WAITCNT_DEPCTR:
+  case AMDGPU::S_WAITCNT_EXPCNT:
+  case AMDGPU::S_WAITCNT_LGKMCNT:
+  case AMDGPU::S_WAITCNT_VMCNT:
+  case AMDGPU::S_WAITCNT_VSCNT:
+  // case AMDGPU::S_WAITCNT_DEPCTR_gfx10: Not sure how this instruction works
+  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx6_gfx7:
+  case AMDGPU::S_WAITCNT_vi:
+    return processWaitCnt(Inst, MCI);
+  }
+}
+
+// s_waitcnt instructions encode important information as immediate operands
+// which are lost during the MCInst -> mca::Instruction lowering.
+void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
+                                            const MCInst &MCI) {
+  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
+    MCAOperand Op;
+    const MCOperand &MCOp = MCI.getOperand(Idx);
+    if (MCOp.isReg()) {
+      Op = MCAOperand::createReg(MCOp.getReg());
+    } else if (MCOp.isImm()) {
+      Op = MCAOperand::createImm(MCOp.getImm());
+    }
+    Op.setIndex(Idx);
+    Inst->addOperand(Op);
+  }
+}
+
+AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
+                                             const SourceMgr &SrcMgr,
+                                             const MCInstrInfo &MCII)
+    : CustomBehaviour(STI, SrcMgr, MCII) {
+  generateWaitCntInfo();
+}
+
+unsigned AMDGPUCustomBehaviour::checkCustomHazard(
+    const SmallVector<InstRef> &IssuedInst, const InstRef &IR) {
+  const Instruction &Inst = *IR.getInstruction();
+  unsigned Opcode = Inst.getOpcode();
+
+  switch (Opcode) {
+  default:
+    return 0;
+  case AMDGPU::S_WAITCNT: // This instruction
+  case AMDGPU::S_WAITCNT_DEPCTR:
+  case AMDGPU::S_WAITCNT_EXPCNT:
+  case AMDGPU::S_WAITCNT_LGKMCNT:
+  case AMDGPU::S_WAITCNT_VMCNT:
+  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+  // case AMDGPU::S_WAITCNT_DEPCTR_gfx10: Not sure how this instruction works
+  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx6_gfx7:
+  case AMDGPU::S_WAITCNT_vi:
+    // s_endpgm also behaves as if there is an implicit
+    // s_waitcnt 0, but I'm not sure if it would be appropriate
+    // to model this in llvm-mca based on how the iterations work
+    // while simulating the pipeline over and over.
+    return handleWaitCnt(IssuedInst, IR);
+  }
+
+  return 0;
+}
+
+unsigned
+AMDGPUCustomBehaviour::handleWaitCnt(const SmallVector<InstRef> &IssuedInst,
+                                     const InstRef &IR) {
+  // Set the max values to begin.
+  unsigned Vmcnt = 63;
+  unsigned Expcnt = 7;
+  unsigned Lgkmcnt = 31;
+  unsigned Vscnt = 63;
+  unsigned CurrVmcnt = 0;
+  unsigned CurrExpcnt = 0;
+  unsigned CurrLgkmcnt = 0;
+  unsigned CurrVscnt = 0;
+  unsigned CyclesToWaitVm = ~0U;
+  unsigned CyclesToWaitExp = ~0U;
+  unsigned CyclesToWaitLgkm = ~0U;
+  unsigned CyclesToWaitVs = ~0U;
+
+  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
+
+  // We will now look at each of the currently executing instructions
+  // to find out if this wait instruction still needs to wait.
+  for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
+    const InstRef &PrevIR = *I;
+    const Instruction &PrevInst = *PrevIR.getInstruction();
+    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
+    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
+    const int CyclesLeft = PrevInst.getCyclesLeft();
+    assert(CyclesLeft != UNKNOWN_CYCLES &&
+           "We should know how many cycles are left for this instruction");
+    if (PrevInstWaitInfo.VmCnt) {
+      CurrVmcnt++;
+      if ((unsigned)CyclesLeft < CyclesToWaitVm)
+        CyclesToWaitVm = CyclesLeft;
+    }
+    if (PrevInstWaitInfo.ExpCnt) {
+      CurrExpcnt++;
+      if ((unsigned)CyclesLeft < CyclesToWaitExp)
+        CyclesToWaitExp = CyclesLeft;
+    }
+    if (PrevInstWaitInfo.LgkmCnt) {
+      CurrLgkmcnt++;
+      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
+        CyclesToWaitLgkm = CyclesLeft;
+    }
+    if (PrevInstWaitInfo.VsCnt) {
+      CurrVscnt++;
+      if ((unsigned)CyclesLeft < CyclesToWaitVs)
+        CyclesToWaitVs = CyclesLeft;
+    }
+  }
+
+  unsigned CyclesToWait = ~0U;
+  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
+    CyclesToWait = CyclesToWaitVm;
+  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
+    CyclesToWait = CyclesToWaitExp;
+  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
+    CyclesToWait = CyclesToWaitLgkm;
+  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
+    CyclesToWait = CyclesToWaitVs;
+
+  // We may underestimate how many cycles we need to wait, but this
+  // isn't a big deal. Our return value is just how many cycles until
+  // this function gets run again. So as long as we don't overestimate
+  // the wait time, we'll still end up stalling at this instruction
+  // for the correct number of cycles.
+
+  if (CyclesToWait == ~0U)
+    return 0;
+  return CyclesToWait;
+}
+
+void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
+                                           unsigned &Expcnt, unsigned &Lgkmcnt,
+                                           unsigned &Vscnt) {
+  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
+  const Instruction &Inst = *IR.getInstruction();
+  unsigned Opcode = Inst.getOpcode();
+
+  switch (Opcode) {
+  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
+    // Should probably be checking for nullptr
+    // here, but I'm not sure how I should handle the case
+    // where we see a nullptr.
+    const MCAOperand *OpReg = Inst.getOperand(0);
+    const MCAOperand *OpImm = Inst.getOperand(1);
+    assert(OpReg->isReg() && "First operand should be a register.");
+    assert(OpImm->isImm() && "Second operand should be an immediate.");
+    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
+      // Instruction is using a real register.
+      // Since we can't know what value this register will have,
+      // we can't compute what the value of this wait should be.
+      WithColor::warning() << "The register component of "
+                           << MCII.getName(Opcode) << " will be completely "
+                           << "ignored. So the wait may not be accurate.\n";
+    }
+    switch (Opcode) {
+    // Redundant switch so I don't have to repeat the code above
+    // for each case. There are more clever ways to avoid this
+    // extra switch and anyone can feel free to implement one of them.
+    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
+      Expcnt = OpImm->getImm();
+      break;
+    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
+      Lgkmcnt = OpImm->getImm();
+      break;
+    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
+      Vmcnt = OpImm->getImm();
+      break;
+    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
+      Vscnt = OpImm->getImm();
+      break;
+    }
+    return;
+  }
+  case AMDGPU::S_WAITCNT_gfx10:
+  case AMDGPU::S_WAITCNT_gfx6_gfx7:
+  case AMDGPU::S_WAITCNT_vi:
+    unsigned WaitCnt = Inst.getOperand(0)->getImm();
+    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
+    return;
+  }
+}
+
+void AMDGPUCustomBehaviour::generateWaitCntInfo() {
+  // The core logic from this function is taken from
+  // SIInsertWaitcnts::updateEventWaitcntAfter().
+  // In that pass, some instructions that are being looked at are
+  // Pseudo instructions. The Pseudo instructions have the proper
+  // mayLoad and mayStore flags associated with them, but the
+  // 'real' instructions that we encounter in this function
+  // may not have their mayLoad and mayStore flags set.
+  // For example, S_LOAD_DWORDX2_IMM (op 2293) will be seen
+  // in SIInsertWaitcnts.cpp, but after llc finishes compiling
+  // the assembly source, the instruction will become
+  // S_LOAD_DWORDX2_IMM_si (op 14208).
+  // S_LOAD_DWORDX2_IMM has the mayLoad flag.
+  // S_LOAD_DWORDX2_IMM_si does not have the mayLoad flag.
+  // For this reason, the control flow has been modified in this
+  // function to conservatively assume that some instructions
+  // interact with more CNTs than they should. This will
+  // result in occasionally waiting longer than necessary
+  // at an s_waitcnt instruction.
+  InstrWaitCntInfo.resize(SrcMgr.size());
+
+  int Index = 0;
+  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
+    const std::unique_ptr<Instruction> &Inst = *I;
+    // Due to the flags of 'real' instructions not being set properly,
+    // some instructions are (incorrectly) not detected as interacting
+    // with any CNTs. This function will manually handle those instructions
+    // that we are aware of.
+    if (manuallySetWaitCntInfo(Inst, Index))
+      continue;
+    unsigned Opcode = Inst->getOpcode();
+    const MCInstrDesc &MCID = MCII.get(Opcode);
+    if ((MCID.TSFlags & SIInstrFlags::DS) &&
+        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
+      InstrWaitCntInfo[Index].LgkmCnt = true;
+      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
+        InstrWaitCntInfo[Index].ExpCnt = true;
+    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
+      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
+      // and mayAccessLDSThroughFlat(Inst) would both return true for this
+      // instruction.
+      InstrWaitCntInfo[Index].LgkmCnt = true;
+      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+        InstrWaitCntInfo[Index].VmCnt = true;
+      else if (!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) {
+        // else if (MCID.mayLoad() && !(MCID.TSFlags &
+        //          SIInstrFlags::IsAtomicNoRet))
+        //   InstrWaitCntInfo[Index].VmCnt = true;
+        InstrWaitCntInfo[Index].VmCnt = true;
+        InstrWaitCntInfo[Index].VsCnt = true;
+      } else
+        InstrWaitCntInfo[Index].VsCnt = true;
+    } else if (isVMEM(MCID) && Opcode != AMDGPU::BUFFER_WBINVL1 &&
+               Opcode != AMDGPU::BUFFER_WBINVL1_SC &&
+               Opcode != AMDGPU::BUFFER_WBINVL1_VOL &&
+               Opcode != AMDGPU::BUFFER_GL0_INV &&
+               Opcode != AMDGPU::BUFFER_GL1_INV) {
+      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
+        InstrWaitCntInfo[Index].VmCnt = true;
+      else if (!(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet) ||
+               (MCID.TSFlags & SIInstrFlags::MIMG)) {
+        // else if ((MCID.mayLoad() && !(MCID.TSFlags &
+        //           SIInstrFlags::IsAtomicNoRet)) ||
+        //          (MCID.TSFlags & SIInstrFlags::MIMG &&
+        //           !MCID.mayLoad() && !MCID.mayStore()))
+        //   InstrWaitCntInfo[Index].VmCnt = true;
+        InstrWaitCntInfo[Index].VmCnt = true;
+        InstrWaitCntInfo[Index].VsCnt = true;
+      } else
+        // else if (MCID.mayStore())
+        InstrWaitCntInfo[Index].VsCnt = true;
+
+      // Conservatively assume that
+      // GCNSubtarget::vmemWriteNeedsExpWaitcnt() would return true.
+      // if (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet))
+      InstrWaitCntInfo[Index].ExpCnt = true;
+    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
+      InstrWaitCntInfo[Index].LgkmCnt = true;
+    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
+      InstrWaitCntInfo[Index].ExpCnt = true;
+    } else {
+      switch (Opcode) {
+      case AMDGPU::S_SENDMSG:
+      case AMDGPU::S_SENDMSGHALT:
+      case AMDGPU::S_MEMTIME:
+      case AMDGPU::S_MEMREALTIME:
+        InstrWaitCntInfo[Index].LgkmCnt = true;
+        break;
+      }
+    }
+  }
+}
+
+bool AMDGPUCustomBehaviour::manuallySetWaitCntInfo(
+    const std::unique_ptr<Instruction> &Inst, const int Index) {
+  switch (Inst->getOpcode()) {
+  // scroll wheel workout
+  case AMDGPU::DS_READ2ST64_B32_gfx10:
+  case AMDGPU::DS_READ2ST64_B32_gfx6_gfx7:
+  case AMDGPU::DS_READ2ST64_B32_vi:
+  case AMDGPU::DS_READ2ST64_B64_gfx10:
+  case AMDGPU::DS_READ2ST64_B64_gfx6_gfx7:
+  case AMDGPU::DS_READ2ST64_B64_vi:
+  case AMDGPU::DS_READ2_B32_gfx10:
+  case AMDGPU::DS_READ2_B32_gfx6_gfx7:
+  case AMDGPU::DS_READ2_B32_vi:
+  case AMDGPU::DS_READ2_B64_gfx10:
+  case AMDGPU::DS_READ2_B64_gfx6_gfx7:
+  case AMDGPU::DS_READ2_B64_vi:
+  case AMDGPU::DS_READ_ADDTID_B32_gfx10:
+  case AMDGPU::DS_READ_ADDTID_B32_vi:
+  case AMDGPU::DS_READ_B128_gfx10:
+  case AMDGPU::DS_READ_B128_gfx7:
+  case AMDGPU::DS_READ_B128_vi:
+  case AMDGPU::DS_READ_B32_gfx10:
+  case AMDGPU::DS_READ_B32_gfx6_gfx7:
+  case AMDGPU::DS_READ_B32_vi:
+  case AMDGPU::DS_READ_B64_gfx10:
+  case AMDGPU::DS_READ_B64_gfx6_gfx7:
+  case AMDGPU::DS_READ_B64_vi:
+  case AMDGPU::DS_READ_B96_gfx10:
+  case AMDGPU::DS_READ_B96_gfx7:
+  case AMDGPU::DS_READ_B96_vi:
+  case AMDGPU::DS_READ_I16_gfx10:
+  case AMDGPU::DS_READ_I16_gfx6_gfx7:
+  case AMDGPU::DS_READ_I16_vi:
+  case AMDGPU::DS_READ_I8_D16_HI_gfx10:
+  case AMDGPU::DS_READ_I8_D16_HI_vi:
+  case AMDGPU::DS_READ_I8_D16_gfx10:
+  case AMDGPU::DS_READ_I8_D16_vi:
+  case AMDGPU::DS_READ_I8_gfx10:
+  case AMDGPU::DS_READ_I8_gfx6_gfx7:
+  case AMDGPU::DS_READ_I8_vi:
+  case AMDGPU::DS_READ_U16_D16_HI_gfx10:
+  case AMDGPU::DS_READ_U16_D16_HI_vi:
+  case AMDGPU::DS_READ_U16_D16_gfx10:
+  case AMDGPU::DS_READ_U16_D16_vi:
+  case AMDGPU::DS_READ_U16_gfx10:
+  case AMDGPU::DS_READ_U16_gfx6_gfx7:
+  case AMDGPU::DS_READ_U16_vi:
+  case AMDGPU::DS_READ_U8_D16_HI_gfx10:
+  case AMDGPU::DS_READ_U8_D16_HI_vi:
+  case AMDGPU::DS_READ_U8_D16_gfx10:
+  case AMDGPU::DS_READ_U8_D16_vi:
+  case AMDGPU::DS_READ_U8_gfx10:
+  case AMDGPU::DS_READ_U8_gfx6_gfx7:
+  case AMDGPU::DS_READ_U8_vi:
+  case AMDGPU::DS_WRITE2ST64_B32_gfx10:
+  case AMDGPU::DS_WRITE2ST64_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2ST64_B32_vi:
+  case AMDGPU::DS_WRITE2ST64_B64_gfx10:
+  case AMDGPU::DS_WRITE2ST64_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2ST64_B64_vi:
+  case AMDGPU::DS_WRITE2_B32_gfx10:
+  case AMDGPU::DS_WRITE2_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2_B32_vi:
+  case AMDGPU::DS_WRITE2_B64_gfx10:
+  case AMDGPU::DS_WRITE2_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE2_B64_vi:
+  case AMDGPU::DS_WRITE_ADDTID_B32_gfx10:
+  case AMDGPU::DS_WRITE_ADDTID_B32_vi:
+  case AMDGPU::DS_WRITE_B128_gfx10:
+  case AMDGPU::DS_WRITE_B128_gfx7:
+  case AMDGPU::DS_WRITE_B128_vi:
+  case AMDGPU::DS_WRITE_B16_D16_HI_gfx10:
+  case AMDGPU::DS_WRITE_B16_D16_HI_vi:
+  case AMDGPU::DS_WRITE_B16_gfx10:
+  case AMDGPU::DS_WRITE_B16_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B16_vi:
+  case AMDGPU::DS_WRITE_B32_gfx10:
+  case AMDGPU::DS_WRITE_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B32_vi:
+  case AMDGPU::DS_WRITE_B64_gfx10:
+  case AMDGPU::DS_WRITE_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B64_vi:
+  case AMDGPU::DS_WRITE_B8_D16_HI_gfx10:
+  case AMDGPU::DS_WRITE_B8_D16_HI_vi:
+  case AMDGPU::DS_WRITE_B8_gfx10:
+  case AMDGPU::DS_WRITE_B8_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_B8_vi:
+  case AMDGPU::DS_WRITE_B96_gfx10:
+  case AMDGPU::DS_WRITE_B96_gfx7:
+  case AMDGPU::DS_WRITE_B96_vi:
+  case AMDGPU::DS_WRITE_SRC2_B32_gfx10:
+  case AMDGPU::DS_WRITE_SRC2_B32_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_SRC2_B32_vi:
+  case AMDGPU::DS_WRITE_SRC2_B64_gfx10:
+  case AMDGPU::DS_WRITE_SRC2_B64_gfx6_gfx7:
+  case AMDGPU::DS_WRITE_SRC2_B64_vi:
+    // The ds_read and ds_write instructions
+    // are not automatically detected as interacting with
+    // lgkmcnt due to their flags not being ported from
+    // the Pseudo instructions to the 'real' instructions.
+    InstrWaitCntInfo[Index].LgkmCnt = true;
+    return true;
+  }
+
+  return false;
+}
+
+// taken from SIInstrInfo::isVMEM()
+bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
+  return MCID.TSFlags & SIInstrFlags::MUBUF ||
+         MCID.TSFlags & SIInstrFlags::MTBUF ||
+         MCID.TSFlags & SIInstrFlags::MIMG;
+}
+
+// taken from SIInstrInfo::hasModifiersSet()
+bool AMDGPUCustomBehaviour::hasModifiersSet(
+    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
+  if (Idx == -1)
+    return false;
+
+  const MCAOperand *Op = Inst->getOperand(Idx);
+  if (Op == nullptr || !Op->isImm() || !Op->getImm())
+    return false;
+
+  return true;
+}
+
+// taken from SIInstrInfo::isAlwaysGDS()
+bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
+  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
+         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
+         Opcode == AMDGPU::DS_GWS_SEMA_P ||
+         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
+         Opcode == AMDGPU::DS_GWS_BARRIER;
+}
+
+} // namespace mca
+} // namespace llvm
Index: llvm/tools/llvm-mca/lib/AMDGPU/CMakeLists.txt
===================================================================
--- /dev/null
+++ llvm/tools/llvm-mca/lib/AMDGPU/CMakeLists.txt
@@ -0,0 +1,17 @@
+include_directories(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/AMDGPU
+  ${LLVM_BINARY_DIR}/lib/Target/AMDGPU
+  )
+
+set(LLVM_LINK_COMPONENTS
+  AMDGPU
+  Core
+  Support
+  )
+
+add_llvm_library(LLVMMCACustomBehaviourAMDGPU
+  AMDGPUCustomBehaviour.cpp
+
+  DEPENDS
+  AMDGPUCommonTableGen
+  )
Index: llvm/tools/llvm-mca/lib/CMakeLists.txt
===================================================================
--- /dev/null
+++ llvm/tools/llvm-mca/lib/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGETS_TO_APPEND "")
+
+if (LLVM_TARGETS_TO_BUILD MATCHES "AMDGPU")
+  add_subdirectory(AMDGPU)
+  list(APPEND TARGETS_TO_APPEND LLVMMCACustomBehaviourAMDGPU)
+endif()
+
+set(LLVM_MCA_CUSTOMBEHAVIOUR_TARGETS ${TARGETS_TO_APPEND} PARENT_SCOPE)
Index: llvm/tools/llvm-mca/llvm-mca.cpp
===================================================================
--- llvm/tools/llvm-mca/llvm-mca.cpp
+++ llvm/tools/llvm-mca/llvm-mca.cpp
@@ -32,6 +32,7 @@
 #include "Views/SchedulerStatistics.h"
 #include "Views/SummaryView.h"
 #include "Views/TimelineView.h"
+#include "lib/AMDGPU/AMDGPUCustomBehaviour.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -42,6 +43,7 @@
 #include "llvm/MC/MCTargetOptionsCommandFlags.h"
 #include "llvm/MCA/CodeEmitter.h"
 #include "llvm/MCA/Context.h"
+#include "llvm/MCA/CustomBehaviour.h"
 #include "llvm/MCA/InstrBuilder.h"
 #include "llvm/MCA/Pipeline.h"
 #include "llvm/MCA/Stages/EntryStage.h"
@@ -220,6 +222,12 @@
     cl::desc("Print encoding information in the instruction info view"),
     cl::cat(ViewOptions), cl::init(false));
 
+static cl::opt<bool> DisableCustomBehaviour(
+    "disable-cb",
+    cl::desc(
+        "Disable custom behaviour (use the default class which does nothing)."),
+    cl::cat(ViewOptions), cl::init(false));
+
 namespace {
 
 const Target *getTarget(const char *ProgName) {
@@ -285,6 +293,39 @@
   processOptionImpl(PrintRetireStats, Default);
 }
 
+std::unique_ptr<mca::InstrPostProcess>
+createInstrPostProcess(const Triple &TheTriple, const MCSubtargetInfo &STI,
+                       const MCInstrInfo &MCII) {
+  // Might be a good idea to have a separate flag so that InstrPostProcess
+  // can be used with or without CustomBehaviour.
+  if (DisableCustomBehaviour)
+    return std::make_unique<mca::InstrPostProcess>(STI, MCII);
+
+  if (TheTriple.isAMDGPU())
+    return std::make_unique<mca::AMDGPUInstrPostProcess>(STI, MCII);
+
+  return std::make_unique<mca::InstrPostProcess>(STI, MCII);
+}
+
+std::unique_ptr<mca::CustomBehaviour>
+createCustomBehaviour(const Triple &TheTriple, const MCSubtargetInfo &STI,
+                      const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII) {
+  // Build the appropriate CustomBehaviour object for the current target.
+  // The CustomBehaviour class should never depend on the source code,
+  // but it can depend on the list of mca::Instruction and any classes
+  // that can be built using just the target info. If you need extra
+  // information from the source code or the list of MCInst, consider
+  // adding that information to the mca::Instruction class and setting
+  // it during InstrBuilder::createInstruction().
+  if (DisableCustomBehaviour)
+    return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
+
+  if (TheTriple.isAMDGPU())
+    return std::make_unique<mca::AMDGPUCustomBehaviour>(STI, SrcMgr, MCII);
+
+  return std::make_unique<mca::CustomBehaviour>(STI, SrcMgr, MCII);
+}
+
 // Returns true on success.
 static bool runPipeline(mca::Pipeline &P) {
   // Handle pipeline errors here.
@@ -498,6 +539,8 @@
     // Lower the MCInst sequence into an mca::Instruction sequence.
     ArrayRef<MCInst> Insts = Region->getInstructions();
     mca::CodeEmitter CE(*STI, *MAB, *MCE, Insts);
+    std::unique_ptr<mca::InstrPostProcess> IPP =
+        createInstrPostProcess(TheTriple, *STI, *MCII);
     std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
     for (const MCInst &MCI : Insts) {
      Expected<std::unique_ptr<mca::Instruction>> Inst =
@@ -520,6 +563,8 @@
         return 1;
       }
 
+      IPP->postProcessInstruction(Inst.get(), MCI);
+
       LoweredSequence.emplace_back(std::move(Inst.get()));
     }
 
@@ -547,8 +592,17 @@
       continue;
     }
 
+    // Create the CustomBehaviour object for enforcing Target Specific
+    // behaviours and dependencies that aren't expressed well enough
+    // in the tablegen. CB cannot depend on the list of MCInst or
+    // the source code (but it can depend on the list of
+    // mca::Instruction or any objects that can be reconstructed
+    // from the target information).
+    std::unique_ptr<mca::CustomBehaviour> CB =
+        createCustomBehaviour(TheTriple, *STI, S, *MCII);
+
     // Create a basic pipeline simulating an out-of-order backend.
-    auto P = MCA.createDefaultPipeline(PO, S);
+    auto P = MCA.createDefaultPipeline(PO, S, *CB);
 
     mca::PipelinePrinter Printer(*P, PrintJson ? mca::View::OK_JSON
                                                : mca::View::OK_READABLE);
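The InstrPostProcess half of the interface is easiest to see end to end with a small sketch. It mirrors what AMDGPUInstrPostProcess::processWaitCnt does above: preserve an immediate operand from the MCInst as an MCAOperand so that a CustomBehaviour subclass can read it later via getOperand(). The `FooInstrPostProcess` class and `FooFenceOpcode` constant are hypothetical; only the MCAOperand::createImm/setIndex and InstructionBase::addOperand/getOperand calls come from this patch.

#include "llvm/MCA/CustomBehaviour.h"

namespace llvm {
namespace mca {

// Hypothetical opcode for this sketch only; a real target would use its
// generated opcode enum (e.g. AMDGPU::S_WAITCNT) instead.
constexpr unsigned FooFenceOpcode = 43;

class FooInstrPostProcess : public InstrPostProcess {
public:
  using InstrPostProcess::InstrPostProcess;

  void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
                              const MCInst &MCI) override {
    // By default, mca only lowers register defs/uses, so immediates are
    // lost. Keep the fence's immediate so checkCustomHazard() can read it.
    if (MCI.getOpcode() != FooFenceOpcode)
      return;
    const MCOperand &MCOp = MCI.getOperand(0);
    if (MCOp.isImm()) {
      MCAOperand Op = MCAOperand::createImm(MCOp.getImm());
      Op.setIndex(0); // Remember the operand's index in the original MCInst.
      Inst->addOperand(Op);
    }
  }
};

// Later, inside a CustomBehaviour subclass:
//   if (const MCAOperand *Op = IR.getInstruction()->getOperand(0))
//     int64_t FenceMask = Op->getImm();

} // namespace mca
} // namespace llvm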