Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp
===================================================================
--- lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -21,6 +21,7 @@
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -37,6 +38,7 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <list>
 
@@ -48,42 +50,143 @@
 
 namespace {
 
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+/// Memory operation flags. Can be ORed together.
+enum class SIMemOp {
+  NONE = 0u,
+  LOAD = 1u << 0,
+  STORE = 1u << 1,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
+};
+
+/// Position to insert a new instruction relative to an existing
+/// instruction.
+enum class Position {
+  BEFORE,
+  AFTER
+};
+
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+  NONE,
+  SINGLETHREAD,
+  WAVEFRONT,
+  WORKGROUP,
+  AGENT,
+  SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+  NONE = 0u,
+  GLOBAL = 1u << 0,
+  LDS = 1u << 1,
+  SCRATCH = 1u << 2,
+  GDS = 1u << 3,
+  OTHER = 1u << 4,
+
+  /// The address spaces that can be accessed by a FLAT instruction.
+  FLAT = GLOBAL | LDS | SCRATCH,
+
+  /// The address spaces that support atomic instructions.
+  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+  /// All address spaces.
+  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
+/// Sets named bit \p BitName to "true" if present in instruction \p
+/// MI.
+/// \returns Returns true if \p MI is modified, false otherwise.
+template <uint16_t BitName>
+bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
+  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+  if (BitIdx == -1)
+    return false;
+
+  MachineOperand &Bit = MI->getOperand(BitIdx);
+  if (Bit.getImm() != 0)
+    return false;
+
+  Bit.setImm(1);
+  return true;
+}
+
 class SIMemOpInfo final {
 private:
-  SyncScope::ID SSID = SyncScope::System;
+
+  friend class SIMemOpAccess;
+
   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
   bool IsNonTemporal = false;
 
-  SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
-      : SSID(SSID), Ordering(Ordering) {}
-
-  SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
-              AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
-      : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
-        IsNonTemporal(IsNonTemporal) {}
-
-  /// \returns Info constructed from \p MI, which has at least machine memory
-  /// operand.
-  static Optional<SIMemOpInfo> constructFromMIWithMMO(
-      const MachineBasicBlock::iterator &MI);
+  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+              bool IsCrossAddressSpaceOrdering = true,
+              AtomicOrdering FailureOrdering =
+                AtomicOrdering::SequentiallyConsistent,
+              bool IsNonTemporal = false)
+    : Ordering(Ordering), FailureOrdering(FailureOrdering),
+      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
+      InstrAddrSpace(InstrAddrSpace),
+      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+      IsNonTemporal(IsNonTemporal) {
+    // No cross address space ordering if the ordered and accessed address
+    // spaces are the same single address space. The parameter shadows the
+    // member of the same name, so the member must be written via this->.
+    if ((OrderingAddrSpace == InstrAddrSpace) &&
+        isPowerOf2_32(uint32_t(InstrAddrSpace)))
+      this->IsCrossAddressSpaceOrdering = false;
+  }
 
 public:
-  /// \returns Synchronization scope ID of the machine instruction used to
+  /// \returns Atomic synchronization scope of the machine instruction used to
   /// create this SIMemOpInfo.
-  SyncScope::ID getSSID() const {
-    return SSID;
+  SIAtomicScope getScope() const {
+    return Scope;
   }
+
   /// \returns Ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo.
   AtomicOrdering getOrdering() const {
     return Ordering;
   }
+
   /// \returns Failure ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo.
   AtomicOrdering getFailureOrdering() const {
     return FailureOrdering;
   }
+
+  /// \returns The address spaces accessed by the machine
+  /// instruction used to create this SIMemOpInfo.
+  SIAtomicAddrSpace getInstrAddrSpace() const {
+    return InstrAddrSpace;
+  }
+
+  /// \returns The address spaces that must be ordered by the machine
+  /// instruction used to create this SIMemOpInfo.
+  SIAtomicAddrSpace getOrderingAddrSpace() const {
+    return OrderingAddrSpace;
+  }
+
+  /// \returns Return true iff memory ordering of operations on
+  /// different address spaces is required.
+  bool getIsCrossAddressSpaceOrdering() const {
+    return IsCrossAddressSpaceOrdering;
+  }
+
   /// \returns True if memory access of the machine instruction used to
   /// create this SIMemOpInfo is non-temporal, false otherwise.
   bool isNonTemporal() const {
@@ -96,59 +199,110 @@
     return Ordering != AtomicOrdering::NotAtomic;
   }
 
-  /// \returns Load info if \p MI is a load operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getLoadInfo(
+};
+
+class SIMemOpAccess final {
+private:
+
+  AMDGPUAS SIAddrSpaceInfo;
+  AMDGPUMachineModuleInfo *MMI = nullptr;
+
+  /// Reports unsupported message \p Msg for \p MI to LLVM context.
+  void reportUnsupported(const MachineBasicBlock::iterator &MI,
+                         const char *Msg);
+
+  /// Inspects the target synchronization scope \p SSID and
+  /// determines the SI atomic scope it corresponds to, the address
+  /// spaces it covers, and whether the memory ordering applies
+  /// between address spaces.
+  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope);
+
+  /// \return Return a bit set of the address spaces accessed by \p
+  /// AS.
+  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS);
+
+  /// \returns Info constructed from \p MI, which has at least machine memory
+  /// operand.
+  Optional<SIMemOpInfo> constructFromMIWithMMO(
       const MachineBasicBlock::iterator &MI);
+
+public:
+  /// Construct class to support accessing the machine memory operands
+  /// of instructions in the machine function \p MF.
+  SIMemOpAccess(MachineFunction &MF);
+
+  /// \returns Load info if \p MI is a load operation, "None" otherwise.
+  Optional<SIMemOpInfo> getLoadInfo(const MachineBasicBlock::iterator &MI);
+
   /// \returns Store info if \p MI is a store operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getStoreInfo(
-      const MachineBasicBlock::iterator &MI);
+  Optional<SIMemOpInfo> getStoreInfo(const MachineBasicBlock::iterator &MI);
+
   /// \returns Atomic fence info if \p MI is an atomic fence operation,
   /// "None" otherwise.
-  static Optional<SIMemOpInfo> getAtomicFenceInfo(
+  Optional<SIMemOpInfo> getAtomicFenceInfo(
       const MachineBasicBlock::iterator &MI);
+
   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
   /// rmw operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
-      const MachineBasicBlock::iterator &MI);
-
-  /// Reports unknown synchronization scope used in \p MI to LLVM
-  /// context.
-  static void reportUnknownSyncScope(
+  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
       const MachineBasicBlock::iterator &MI);
 };
 
-class SIMemoryLegalizer final : public MachineFunctionPass {
-private:
-  /// Machine module info.
-  const AMDGPUMachineModuleInfo *MMI = nullptr;
+class SICacheControl {
+protected:
 
   /// Instruction info.
   const SIInstrInfo *TII = nullptr;
 
-  /// Immediate for "vmcnt(0)".
-  unsigned Vmcnt0Immediate = 0;
-
-  /// Opcode for cache invalidation instruction (L1).
-  unsigned VmemSIMDCacheInvalidateOpc = 0;
+  IsaInfo::IsaVersion IV;
 
-  /// List of atomic pseudo instructions.
-  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+  SICacheControl(const SISubtarget &ST);
 
-  /// Sets named bit (BitName) to "true" if present in \p MI. Returns
-  /// true if \p MI is modified, false otherwise.
-  template <uint16_t BitName>
-  bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
-    int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
-    if (BitIdx == -1)
-      return false;
+public:
 
-    MachineOperand &Bit = MI->getOperand(BitIdx);
-    if (Bit.getImm() != 0)
-      return false;
+  /// Create a cache control for the subtarget \p ST.
+  static std::unique_ptr<SICacheControl> create(const SISubtarget &ST);
+  virtual ~SICacheControl() = default; // Deleted through unique_ptr<base>.
+  /// Update \p MI memory load instruction to bypass any caches up to
+  /// the \p Scope memory scope for address spaces \p
+  /// AddrSpace. Return true iff the instruction was modified.
+  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace) const = 0;
+
+  /// Update \p MI memory instruction to indicate it is
+  /// nontemporal. Return true iff the instruction was modified.
+  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
+    const = 0;
+
+  /// Inserts any necessary instructions at position \p Pos relative
+  /// to instruction \p MI to ensure any caches associated with
+  /// address spaces \p AddrSpace for memory scopes up to memory scope
+  /// \p Scope are invalidated. Returns true iff any instructions
+  /// inserted.
+  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace,
+                                     Position Pos) const = 0;
+
+  /// Inserts any necessary instructions at position \p Pos relative
+  /// to instruction \p MI to ensure memory instructions of kind \p Op
+  /// associated with address spaces \p AddrSpace have completed as
+  /// observed by other memory instructions executing in memory scope
+  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
+  /// ordering is between address spaces. Returns true iff any
+  /// instructions inserted.
+  virtual bool insertWait(MachineBasicBlock::iterator &MI,
+                          SIAtomicScope Scope,
+                          SIAtomicAddrSpace AddrSpace,
+                          SIMemOp Op,
+                          bool IsCrossAddrSpaceOrdering,
+                          Position Pos) const = 0;
+};
 
-    Bit.setImm(1);
-    return true;
-  }
+class SIGfx6CacheControl : public SICacheControl {
+protected:
 
   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
   /// is modified, false otherwise.
@@ -162,14 +316,55 @@
     return enableNamedBit<AMDGPU::OpName::slc>(MI);
   }
 
-  /// Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
-  /// Always returns true.
-  bool insertVmemSIMDCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                     bool Before = true) const;
-  /// Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
-  /// Always returns true.
-  bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
-                           bool Before = true) const;
+public:
+  /// Construct the cache control for the subtarget \p ST.
+  SIGfx6CacheControl(const SISubtarget &ST) : SICacheControl(ST) {}
+
+  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+
+  bool insertWait(MachineBasicBlock::iterator &MI,
+                  SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace,
+                  SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering,
+                  Position Pos) const override;
+};
+
+class SIGfx7CacheControl : public SIGfx6CacheControl {
+public:
+  /// Construct the cache control for the subtarget \p ST.
+  SIGfx7CacheControl(const SISubtarget &ST) : SIGfx6CacheControl(ST) {}
+
+  /// Uses BUFFER_WBINVL1_VOL where the base class uses BUFFER_WBINVL1.
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+};
+
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+
+  /// Cache Control.
+  std::unique_ptr<SICacheControl> CC = nullptr;
+
+  /// List of atomic pseudo instructions.
+  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+  /// Return true iff instruction \p MI is a rmw atomic instruction
+  /// that returns a result.
+  bool isAtomicRet(const MachineInstr &MI) const {
+    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+  }
 
   /// Removes all processed atomic pseudo instructions from the current
   /// function. Returns true if current function is modified, false otherwise.
@@ -211,47 +406,129 @@
 
 } // end namespace anonymous
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
+void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
+                                      const char *Msg) {
+  const Function &Func = MI->getParent()->getParent()->getFunction();
+  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
+  Func.getContext().diagnose(Diag);
+}
+
+Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
+                               SIAtomicAddrSpace InstrScope) {
+  // TODO: For now assume the OpenCL memory model, which treats each
+  // address space as having a separate happens-before relation, and
+  // so an instruction only has ordering with respect to the address
+  // space it accesses, and if it accesses multiple address spaces it
+  // does not require ordering of operations in different address
+  // spaces.
+  if (SSID == SyncScope::System)
+    return std::make_tuple(SIAtomicScope::SYSTEM,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getAgentSSID())
+    return std::make_tuple(SIAtomicScope::AGENT,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getWorkgroupSSID())
+    return std::make_tuple(SIAtomicScope::WORKGROUP,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getWavefrontSSID())
+    return std::make_tuple(SIAtomicScope::WAVEFRONT,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == SyncScope::SingleThread)
+    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  // TODO: To support the HSA memory model, additional memory scopes
+  // that do require cross address space ordering need to be added
+  // here.
+  return None;
+}
+
+SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) {
+  if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+    return SIAtomicAddrSpace::FLAT;
+  if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+    return SIAtomicAddrSpace::GLOBAL;
+  if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+    return SIAtomicAddrSpace::LDS;
+  if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+    return SIAtomicAddrSpace::SCRATCH;
+  if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+    return SIAtomicAddrSpace::GDS;
+
+  return SIAtomicAddrSpace::OTHER;
+}
+
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+  SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
+  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
+
+Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) {
   assert(MI->getNumMemOperands() > 0);
 
-  const MachineFunction *MF = MI->getParent()->getParent();
-  const AMDGPUMachineModuleInfo *MMI =
-      &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
-
   SyncScope::ID SSID = SyncScope::SingleThread;
   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
   bool IsNonTemporal = true;
 
   // Validator should check whether or not MMOs cover the entire set of
   // locations accessed by the memory instruction.
   for (const auto &MMO : MI->memoperands()) {
-    const auto &IsSyncScopeInclusion =
-        MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
-    if (!IsSyncScopeInclusion) {
-      reportUnknownSyncScope(MI);
-      return None;
-    }
-
-    SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
-    Ordering =
-        isStrongerThan(Ordering, MMO->getOrdering()) ?
-            Ordering : MMO->getOrdering();
-    FailureOrdering =
-        isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
-            FailureOrdering : MMO->getFailureOrdering();
-
     if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
       IsNonTemporal = false;
+    InstrAddrSpace |=
+      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
+    AtomicOrdering OpOrdering = MMO->getOrdering();
+    if (OpOrdering != AtomicOrdering::NotAtomic) {
+      const auto &IsSyncScopeInclusion =
+          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+      if (!IsSyncScopeInclusion) {
+        reportUnsupported(MI,
+          "Unsupported non-inclusive atomic synchronization scope");
+        return None;
+      }
+
+      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+      Ordering =
+          isStrongerThan(Ordering, OpOrdering) ?
+              Ordering : MMO->getOrdering();
+      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
+             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
+      FailureOrdering =
+          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+              FailureOrdering : MMO->getFailureOrdering();
+    }
   }
 
-  return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+  SIAtomicScope Scope = SIAtomicScope::NONE;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
+  if (Ordering != AtomicOrdering::NotAtomic) {
+    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
+    if (!ScopeOrNone) {
+      reportUnsupported(MI, "Unsupported atomic synchronization scope");
+      return None;
+    }
+    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+      ScopeOrNone.getValue();
+    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+      reportUnsupported(MI, "Unsupported atomic address space");
+      return None;
+    }
+  }
+  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+                     IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
+Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
     const MachineBasicBlock::iterator &MI) {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
@@ -260,14 +537,12 @@
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
+Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
     const MachineBasicBlock::iterator &MI) {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
@@ -276,29 +551,45 @@
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
     const MachineBasicBlock::iterator &MI) {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
     return None;
 
-  SyncScope::ID SSID =
-      static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
   AtomicOrdering Ordering =
-      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
-  return SIMemOpInfo(SSID, Ordering);
+    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+
+  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
+  if (!ScopeOrNone) {
+    reportUnsupported(MI, "Unsupported atomic synchronization scope");
+    return None;
+  }
+
+  SIAtomicScope Scope = SIAtomicScope::NONE;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
+  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+    ScopeOrNone.getValue();
+
+  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+    reportUnsupported(MI, "Unsupported atomic address space");
+    return None;
+  }
+
+  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
+                     IsCrossAddressSpaceOrdering);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgOrRmwInfo(
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
     const MachineBasicBlock::iterator &MI) {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
@@ -307,52 +598,251 @@
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
+}
+
+SICacheControl::SICacheControl(const SISubtarget &ST) {
+  TII = ST.getInstrInfo();
+  IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
 }
 
 /* static */
-void SIMemOpInfo::reportUnknownSyncScope(
-    const MachineBasicBlock::iterator &MI) {
-  DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
-                                 "Unsupported synchronization scope");
-  LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
-  CTX->diagnose(Diag);
+std::unique_ptr<SICacheControl> SICacheControl::create(const SISubtarget &ST) {
+  AMDGPUSubtarget::Generation Generation = ST.getGeneration();
+  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return make_unique<SIGfx6CacheControl>(ST);
+  return make_unique<SIGfx7CacheControl>(ST);
+}
+
+bool SIGfx6CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    /// TODO Do not set glc for rmw atomic operations as they
+    /// implicitly bypass the L1 cache.
+
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx6CacheControl::enableNonTemporal(
+    const MachineBasicBlock::iterator &MI) const {
+  assert(MI->mayLoad() ^ MI->mayStore());
+  bool Changed = false;
+
+  /// TODO Do not enableGLCBit if rmw atomic.
+  Changed |= enableGLCBit(MI);
+  Changed |= enableSLCBit(MI);
+
+  return Changed;
 }
 
-bool SIMemoryLegalizer::insertVmemSIMDCacheInvalidate(
-  MachineBasicBlock::iterator &MI, bool Before) const {
+bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                               SIAtomicScope Scope,
+                                               SIAtomicAddrSpace AddrSpace,
+                                               Position Pos) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
-  if (!Before)
+  if (Pos == Position::AFTER)
     ++MI;
 
-  BuildMI(MBB, MI, DL, TII->get(VmemSIMDCacheInvalidateOpc));
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
 
-  if (!Before)
+  if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
 }
 
-bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
-                                            bool Before) const {
+bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                    SIAtomicScope Scope,
+                                    SIAtomicAddrSpace AddrSpace,
+                                    SIMemOp Op,
+                                    bool IsCrossAddrSpaceOrdering,
+                                    Position Pos) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
-  if (!Before)
+  if (Pos == Position::AFTER)
     ++MI;
 
-  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+  bool VMCnt = false;
+  bool LGKMCnt = false;
+  bool EXPCnt = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      VMCnt = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L1 cache keeps all memory operations in order for
+      // wavefronts in the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
 
-  if (!Before)
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If no cross address space ordering then an LDS waitcnt is not
+      // needed as LDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/GDS memory as LDS operations
+      // could be reordered with respect to later global/GDS memory
+      // operations of the same wave.
+      LGKMCnt = IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // If no cross address space ordering then a GDS waitcnt is not
+      // needed as GDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/LDS memory as GDS operations
+      // could be reordered with respect to later global/LDS memory
+      // operations of the same wave.
+      EXPCnt = IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The GDS keeps all memory operations in order for
+      // the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (VMCnt || LGKMCnt || EXPCnt) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            EXPCnt ? 0 : getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
+}
+
+bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                               SIAtomicScope Scope,
+                                               SIAtomicAddrSpace AddrSpace,
+                                               Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
 }
 
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
@@ -373,38 +863,38 @@
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
-          MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= enableGLCBit(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
-        Changed |= insertWaitcntVmcnt0(MI, false);
-        Changed |= insertVmemSIMDCacheInvalidate(MI, false);
-      }
-
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+        MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace());
     }
 
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getInstrAddrSpace(),
+                                SIMemOp::LOAD,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::AFTER);
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::AFTER);
     }
 
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   // Atomic instructions do not have the nontemporal attribute.
   if (MOI.isNonTemporal()) {
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+    Changed |= CC->enableNonTemporal(MI);
     return Changed;
   }
 
@@ -418,28 +908,20 @@
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
+    if (MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
 
-      return Changed;
-    }
-
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      return Changed;
-    }
-
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   // Atomic instructions do not have the nontemporal attribute.
   if (MOI.isNonTemporal()) {
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+    Changed |= CC->enableNonTemporal(MI);
     return Changed;
   }
 
@@ -450,34 +932,35 @@
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
+  AtomicPseudoMIs.push_back(MI);
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertVmemSIMDCacheInvalidate(MI);
-
-      AtomicPseudoMIs.push_back(MI);
-      return Changed;
-    }
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      /// TODO: This relies on a barrier always generating a waitcnt
+      /// for LDS to ensure it is not reordered with the completion of
+      /// the preceding LDS operations. If barrier had a memory
+      /// ordering and memory scope, then library does not need to
+      /// generate a fence. Could add support in this file for
+      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
+      /// adding waitcnt before a S_BARRIER.
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::BEFORE);
 
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      AtomicPseudoMIs.push_back(MI);
-      return Changed;
-    }
-
-    SIMemOpInfo::reportUnknownSyncScope(MI);
+    return Changed;
   }
 
   return Changed;
@@ -490,34 +973,33 @@
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
-          MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
-          MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
-          MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
-        Changed |= insertWaitcntVmcnt0(MI, false);
-        Changed |= insertVmemSIMDCacheInvalidate(MI, false);
-      }
-
-      return Changed;
-    }
-
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      Changed |= enableGLCBit(MI);
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                isAtomicRet(*MI) ? SIMemOp::LOAD :
+                                                   SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::AFTER);
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::AFTER);
     }
 
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   return Changed;
@@ -525,30 +1007,22 @@
 
 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
-
-  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
-  TII = ST.getInstrInfo();
 
-  Vmcnt0Immediate =
-      AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
-  VmemSIMDCacheInvalidateOpc =
-     ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
-       AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+  SIMemOpAccess MOA(MF);
+  CC = SICacheControl::create(MF.getSubtarget<SISubtarget>());
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 
-      if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+      if (const auto &MOI = MOA.getLoadInfo(MI))
         Changed |= expandLoad(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+      else if (const auto &MOI = MOA.getStoreInfo(MI))
         Changed |= expandStore(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
         Changed |= expandAtomicFence(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgOrRmwInfo(MI))
+      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
     }
   }
Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
@@ -0,0 +1,265 @@
+# RUN: not llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - 2>&1 | FileCheck %s
+
+--- |
+  ; ModuleID = 'memory-legalizer-invalid-addrspace.ll'
+  source_filename = "memory-legalizer-invalid-addrspace.ll"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+  target triple = "amdgcn-amd-"
+
+  define amdgpu_kernel void @invalid_load(i32 addrspace(42)* %in, i32* %out) #0 {
+  entry:
+    %0 = getelementptr i32, i32 addrspace(42)* %in, i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %val = load atomic i32, i32 addrspace(42)* %0 seq_cst, align 4
+    store i32 %val, i32* %out
+    ret void
+  }
+
+  define amdgpu_kernel void @invalid_store(i32 %in, i32 addrspace(42)* %out) #0 {
+  entry:
+    store atomic i32 %in, i32 addrspace(42)* %out syncscope("agent") seq_cst, align 4
+    ret void
+  }
+
+  define amdgpu_kernel void @invalid_cmpxchg(i32 addrspace(42)* %out, i32 %in, i32 %old) #0 {
+  entry:
+    %gep = getelementptr i32, i32 addrspace(42)* %out, i32 4
+    %val = cmpxchg volatile i32 addrspace(42)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
+    ret void
+  }
+
+  define amdgpu_kernel void @invalid_rmw(i32 addrspace(42)* %out, i32 %in) #0 {
+  entry:
+    %val = atomicrmw volatile xchg i32 addrspace(42)* %out, i32 %in syncscope("wavefront") seq_cst
+    ret void
+  }
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #1
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #1
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.break.i64.i64(i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.if.break.i64.i64(i1, i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.else.break.i64.i64.i64(i64, i64) #2
+
+  ; Function Attrs: convergent nounwind
+  declare i1 @llvm.amdgcn.loop.i64(i64) #1
+
+  ; Function Attrs: convergent nounwind
+  declare void @llvm.amdgcn.end.cf.i64(i64) #1
+
+  attributes #0 = { "target-cpu"="gfx803" }
+  attributes #1 = { convergent nounwind }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
+
+...
+---
+
+# CHECK: error: <unknown>:0:0: in function invalid_load void (i32 addrspace(42)*, i32*): Unsupported atomic address space
+
+name:            invalid_load
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
+    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load seq_cst 4 from %ir.0, addrspace 42)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
+---
+
+# CHECK: error: <unknown>:0:0: in function invalid_store void (i32, i32 addrspace(42)*): Unsupported atomic address space
+
+name:            invalid_store
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.out, addrspace 42)
+    S_ENDPGM
+
+...
+---
+
+# CHECK: error: <unknown>:0:0: in function invalid_cmpxchg void (i32 addrspace(42)*, i32, i32): Unsupported atomic address space
+
+name:            invalid_cmpxchg
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr5 = S_LOAD_DWORD_IMM killed renamable $sgpr0_sgpr1, 48, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr2, target-flags(amdgpu-gotprel) 16, implicit-def $scc
+    renamable $sgpr1 = S_ADDC_U32 killed renamable $sgpr3, target-flags(amdgpu-gotprel32-lo) 0, implicit-def dead $scc, implicit killed $scc
+    $vgpr3 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit-def $vgpr2_vgpr3, implicit $sgpr0_sgpr1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup") seq_cst seq_cst 4 on %ir.gep, addrspace 42)
+    S_ENDPGM
+
+...
+---
+
+# CHECK: error: <unknown>:0:0: in function invalid_rmw void (i32 addrspace(42)*, i32): Unsupported atomic address space
+
+name:            invalid_rmw
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+
+    renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+    renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront") seq_cst 4 on %ir.out, addrspace 42)
+    S_ENDPGM
+
+...
Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
===================================================================
--- test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
+++ test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
@@ -1,14 +1,14 @@
 ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
 ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
-; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported atomic synchronization scope
 define amdgpu_kernel void @invalid_fence() {
 entry:
   fence syncscope("invalid") seq_cst
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_load void (i32*, i32*): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_load void (i32*, i32*): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_load(
     i32* %in, i32* %out) {
 entry:
@@ -17,7 +17,7 @@
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_store void (i32, i32*): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_store void (i32, i32*): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_store(
     i32 %in, i32* %out) {
 entry:
@@ -25,7 +25,7 @@
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_cmpxchg void (i32*, i32, i32): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_cmpxchg void (i32*, i32, i32): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_cmpxchg(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -34,7 +34,7 @@
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_rmw void (i32*, i32): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_rmw void (i32*, i32): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_rmw(
     i32* %out, i32 %in) {
 entry:
Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll
===================================================================
--- test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -7,9 +7,11 @@
 
 ; GCN-LABEL: {{^}}system_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @system_unordered(
     i32* %in, i32* %out) {
@@ -21,9 +23,11 @@
 
 ; GCN-LABEL: {{^}}system_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @system_monotonic(
     i32* %in, i32* %out) {
@@ -35,7 +39,8 @@
 
 ; GCN-LABEL: {{^}}system_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
@@ -49,7 +54,7 @@
 
 ; GCN-LABEL: {{^}}system_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
@@ -63,9 +68,11 @@
 
 ; GCN-LABEL: {{^}}singlethread_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_unordered(
     i32* %in, i32* %out) {
@@ -77,9 +84,11 @@
 
 ; GCN-LABEL: {{^}}singlethread_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_monotonic(
     i32* %in, i32* %out) {
@@ -91,9 +100,11 @@
 
 ; GCN-LABEL: {{^}}singlethread_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_acquire(
     i32* %in, i32* %out) {
@@ -105,9 +116,11 @@
 
 ; GCN-LABEL: {{^}}singlethread_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_seq_cst(
     i32* %in, i32* %out) {
@@ -119,9 +132,11 @@
 
 ; GCN-LABEL: {{^}}agent_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @agent_unordered(
     i32* %in, i32* %out) {
@@ -133,9 +148,11 @@
 
 ; GCN-LABEL: {{^}}agent_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @agent_monotonic(
     i32* %in, i32* %out) {
@@ -147,7 +164,8 @@
 
 ; GCN-LABEL: {{^}}agent_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
-; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
@@ -161,7 +179,7 @@
 
 ; GCN-LABEL: {{^}}agent_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
-; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
@@ -175,9 +193,11 @@
 
 ; GCN-LABEL: {{^}}workgroup_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @workgroup_unordered(
     i32* %in, i32* %out) {
@@ -189,9 +209,11 @@
 
 ; GCN-LABEL: {{^}}workgroup_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @workgroup_monotonic(
     i32* %in, i32* %out) {
@@ -203,6 +225,7 @@
 
 ; GCN-LABEL: {{^}}workgroup_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT:  buffer_wbinvl1_vol
@@ -231,9 +254,11 @@
 
 ; GCN-LABEL: {{^}}wavefront_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_unordered(
     i32* %in, i32* %out) {
@@ -245,9 +270,11 @@
 
 ; GCN-LABEL: {{^}}wavefront_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_monotonic(
     i32* %in, i32* %out) {
@@ -259,9 +286,11 @@
 
 ; GCN-LABEL: {{^}}wavefront_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_acquire(
     i32* %in, i32* %out) {
@@ -273,9 +302,11 @@
 
 ; GCN-LABEL: {{^}}wavefront_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GCN-NOT:   s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
+; GCN-NOT:   buffer_{{wbinvl1_vol|gl._inv}}
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_seq_cst(
     i32* %in, i32* %out) {
Index: test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
===================================================================
--- test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
 
 --- |
   ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
@@ -155,7 +155,7 @@
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from %ir.else_ptr), (load syncscope("workgroup") seq_cst 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from %ir.out), (load syncscope("workgroup") seq_cst 4 from %ir.if_ptr)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952