Index: llvm/trunk/docs/AMDGPUUsage.rst
===================================================================
--- llvm/trunk/docs/AMDGPUUsage.rst
+++ llvm/trunk/docs/AMDGPUUsage.rst
@@ -323,62 +323,80 @@
   .. table:: AMDHSA LLVM Sync Scopes
      :name: amdgpu-amdhsa-llvm-sync-scopes-table
 
-     ================ ==========================================================
-     LLVM Sync Scope  Description
-     ================ ==========================================================
-     *none*           The default: ``system``.
-
-                      Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``.
-                      - ``agent`` and executed by a thread on the same agent.
-                      - ``workgroup`` and executed by a thread in the same
-                        workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``agent``        Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system`` or ``agent`` and executed by a thread on the
-                        same agent.
-                      - ``workgroup`` and executed by a thread in the same
-                        workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``workgroup``    Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``, ``agent`` or ``workgroup`` and executed by a
-                        thread in the same workgroup.
-                      - ``wavefront`` and executed by a thread in the same
-                        wavefront.
-
-     ``wavefront``    Synchronizes with, and participates in modification and
-                      seq_cst total orderings with, other operations (except
-                      image operations) for all address spaces (except private,
-                      or generic that accesses private) provided the other
-                      operation's sync scope is:
-
-                      - ``system``, ``agent``, ``workgroup`` or ``wavefront``
-                        and executed by a thread in the same wavefront.
-
-     ``singlethread`` Only synchronizes with, and participates in modification
-                      and seq_cst total orderings with, other operations (except
-                      image operations) running in the same thread for all
-                      address spaces (for example, in signal handlers).
-     ================ ==========================================================
+     ======================= ===================================================
+     LLVM Sync Scope         Description
+     ======================= ===================================================
+     *none*                  The default: ``system``.
+
+                             Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``.
+                             - ``agent`` and executed by a thread on the same
+                               agent.
+                             - ``workgroup`` and executed by a thread in the
+                               same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``agent``               Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system`` or ``agent`` and executed by a thread
+                               on the same agent.
+                             - ``workgroup`` and executed by a thread in the
+                               same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``workgroup``           Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``, ``agent`` or ``workgroup`` and
+                               executed by a thread in the same workgroup.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``wavefront``           Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``, ``agent``, ``workgroup`` or
+                               ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+     ``singlethread``        Only synchronizes with, and participates in
+                             modification and seq_cst total orderings with,
+                             other operations (except image operations) running
+                             in the same thread for all address spaces (for
+                             example, in signal handlers).
+
+     ``one-as``              Same as ``system`` but only synchronizes with other
+                             operations within the same address space.
+
+     ``agent-one-as``        Same as ``agent`` but only synchronizes with other
+                             operations within the same address space.
+
+     ``workgroup-one-as``    Same as ``workgroup`` but only synchronizes with
+                             other operations within the same address space.
+
+     ``wavefront-one-as``    Same as ``wavefront`` but only synchronizes with
+                             other operations within the same address space.
+
+     ``singlethread-one-as`` Same as ``singlethread`` but only synchronizes with
+                             other operations within the same address space.
+     ======================= ===================================================
 
 AMDGPU Intrinsics
 -----------------
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -29,12 +29,22 @@
   // All supported memory/synchronization scopes can be found here:
   //   http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
 
-  /// Agent synchronization scope ID.
+  /// Agent synchronization scope ID (cross address space).
   SyncScope::ID AgentSSID;
-  /// Workgroup synchronization scope ID.
+  /// Workgroup synchronization scope ID (cross address space).
   SyncScope::ID WorkgroupSSID;
-  /// Wavefront synchronization scope ID.
+  /// Wavefront synchronization scope ID (cross address space).
   SyncScope::ID WavefrontSSID;
+  /// System synchronization scope ID (single address space).
+  SyncScope::ID SystemOneAddressSpaceSSID;
+  /// Agent synchronization scope ID (single address space).
+  SyncScope::ID AgentOneAddressSpaceSSID;
+  /// Workgroup synchronization scope ID (single address space).
+  SyncScope::ID WorkgroupOneAddressSpaceSSID;
+  /// Wavefront synchronization scope ID (single address space).
+  SyncScope::ID WavefrontOneAddressSpaceSSID;
+  /// Single thread synchronization scope ID (single address space).
+  SyncScope::ID SingleThreadOneAddressSpaceSSID;
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
@@ -43,35 +53,70 @@
   /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
   /// supported by the AMDGPU target.
   Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
-    if (SSID == SyncScope::SingleThread)
+    if (SSID == SyncScope::SingleThread ||
+        SSID == getSingleThreadOneAddressSpaceSSID())
       return 0;
-    else if (SSID == getWavefrontSSID())
+    else if (SSID == getWavefrontSSID() ||
+             SSID == getWavefrontOneAddressSpaceSSID())
       return 1;
-    else if (SSID == getWorkgroupSSID())
+    else if (SSID == getWorkgroupSSID() ||
+             SSID == getWorkgroupOneAddressSpaceSSID())
       return 2;
-    else if (SSID == getAgentSSID())
+    else if (SSID == getAgentSSID() ||
+             SSID == getAgentOneAddressSpaceSSID())
       return 3;
-    else if (SSID == SyncScope::System)
+    else if (SSID == SyncScope::System ||
+             SSID == getSystemOneAddressSpaceSSID())
       return 4;
 
     return None;
   }
 
+  /// \returns True if \p SSID is restricted to single address space, false
+  /// otherwise
+  bool isOneAddressSpace(SyncScope::ID SSID) const {
+    return SSID == getSingleThreadOneAddressSpaceSSID() ||
+        SSID == getWavefrontOneAddressSpaceSSID() ||
+        SSID == getWorkgroupOneAddressSpaceSSID() ||
+        SSID == getAgentOneAddressSpaceSSID() ||
+        SSID == getSystemOneAddressSpaceSSID();
+  }
+
 public:
   AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
 
-  /// \returns Agent synchronization scope ID.
+  /// \returns Agent synchronization scope ID (cross address space).
   SyncScope::ID getAgentSSID() const {
     return AgentSSID;
   }
-  /// \returns Workgroup synchronization scope ID.
+  /// \returns Workgroup synchronization scope ID (cross address space).
   SyncScope::ID getWorkgroupSSID() const {
     return WorkgroupSSID;
   }
-  /// \returns Wavefront synchronization scope ID.
+  /// \returns Wavefront synchronization scope ID (cross address space).
   SyncScope::ID getWavefrontSSID() const {
     return WavefrontSSID;
   }
+  /// \returns System synchronization scope ID (single address space).
+  SyncScope::ID getSystemOneAddressSpaceSSID() const {
+    return SystemOneAddressSpaceSSID;
+  }
+  /// \returns Agent synchronization scope ID (single address space).
+  SyncScope::ID getAgentOneAddressSpaceSSID() const {
+    return AgentOneAddressSpaceSSID;
+  }
+  /// \returns Workgroup synchronization scope ID (single address space).
+  SyncScope::ID getWorkgroupOneAddressSpaceSSID() const {
+    return WorkgroupOneAddressSpaceSSID;
+  }
+  /// \returns Wavefront synchronization scope ID (single address space).
+  SyncScope::ID getWavefrontOneAddressSpaceSSID() const {
+    return WavefrontOneAddressSpaceSSID;
+  }
+  /// \returns Single thread synchronization scope ID (single address space).
+  SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
+    return SingleThreadOneAddressSpaceSSID;
+  }
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
@@ -87,7 +132,11 @@
     if (!AIO || !BIO)
       return None;
 
-    return AIO.getValue() > BIO.getValue();
+    bool IsAOneAddressSpace = isOneAddressSpace(A);
+    bool IsBOneAddressSpace = isOneAddressSpace(B);
+
+    return AIO.getValue() >= BIO.getValue() &&
+        (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
   }
 };
 
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,16 @@
   AgentSSID = CTX.getOrInsertSyncScopeID("agent");
   WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
   WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+  SystemOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("one-as");
+  AgentOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("agent-one-as");
+  WorkgroupOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("workgroup-one-as");
+  WavefrontOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("wavefront-one-as");
+  SingleThreadOneAddressSpaceSSID =
+      CTX.getOrInsertSyncScopeID("singlethread-one-as");
 }
 
 } // end namespace llvm
Index: llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -417,35 +417,46 @@
 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                                SIAtomicAddrSpace InstrScope) const {
-  /// TODO: For now assume OpenCL memory model which treats each
-  /// address space as having a separate happens-before relation, and
-  /// so an instruction only has ordering with respect to the address
-  /// space it accesses, and if it accesses multiple address spaces it
-  /// does not require ordering of operations in different address
-  /// spaces.
- if (SSID == SyncScope::System)
+  if (SSID == SyncScope::System)
+    return std::make_tuple(SIAtomicScope::SYSTEM,
+                           SIAtomicAddrSpace::ATOMIC,
+                           true);
+  if (SSID == MMI->getAgentSSID())
+    return std::make_tuple(SIAtomicScope::AGENT,
+                           SIAtomicAddrSpace::ATOMIC,
+                           true);
+  if (SSID == MMI->getWorkgroupSSID())
+    return std::make_tuple(SIAtomicScope::WORKGROUP,
+                           SIAtomicAddrSpace::ATOMIC,
+                           true);
+  if (SSID == MMI->getWavefrontSSID())
+    return std::make_tuple(SIAtomicScope::WAVEFRONT,
+                           SIAtomicAddrSpace::ATOMIC,
+                           true);
+  if (SSID == SyncScope::SingleThread)
+    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+                           SIAtomicAddrSpace::ATOMIC,
+                           true);
+  if (SSID == MMI->getSystemOneAddressSpaceSSID())
     return std::make_tuple(SIAtomicScope::SYSTEM,
                            SIAtomicAddrSpace::ATOMIC & InstrScope,
                            false);
-  if (SSID == MMI->getAgentSSID())
+  if (SSID == MMI->getAgentOneAddressSpaceSSID())
     return std::make_tuple(SIAtomicScope::AGENT,
                            SIAtomicAddrSpace::ATOMIC & InstrScope,
                            false);
-  if (SSID == MMI->getWorkgroupSSID())
+  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
     return std::make_tuple(SIAtomicScope::WORKGROUP,
                            SIAtomicAddrSpace::ATOMIC & InstrScope,
                            false);
-  if (SSID == MMI->getWavefrontSSID())
+  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
     return std::make_tuple(SIAtomicScope::WAVEFRONT,
                            SIAtomicAddrSpace::ATOMIC & InstrScope,
                            false);
-  if (SSID == SyncScope::SingleThread)
+  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                            SIAtomicAddrSpace::ATOMIC & InstrScope,
                            false);
-  /// TODO: To support HSA Memory Model need to add additional memory
-  /// scopes that specify that do require cross address space
-  /// ordering.
   return None;
 }
 
@@ -721,13 +732,12 @@
 
   bool VMCnt = false;
   bool LGKMCnt = false;
-  bool EXPCnt = false;
 
   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
-      VMCnt = true;
+      VMCnt |= true;
       break;
     case SIAtomicScope::WORKGROUP:
     case SIAtomicScope::WAVEFRONT:
@@ -751,7 +761,7 @@
       // also synchronizing with global/GDS memory as LDS operations
       // could be reordered with respect to later global/GDS memory
       // operations of the same wave.
-      LGKMCnt = IsCrossAddrSpaceOrdering;
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
       break;
     case SIAtomicScope::WAVEFRONT:
     case SIAtomicScope::SINGLETHREAD:
@@ -773,7 +783,7 @@
       // also synchronizing with global/LDS memory as GDS operations
       // could be reordered with respect to later global/LDS memory
       // operations of the same wave.
-      EXPCnt = IsCrossAddrSpaceOrdering;
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
       break;
     case SIAtomicScope::WORKGROUP:
     case SIAtomicScope::WAVEFRONT:
@@ -786,11 +796,11 @@
     }
   }
 
-  if (VMCnt || LGKMCnt || EXPCnt) {
+  if (VMCnt || LGKMCnt) {
     unsigned WaitCntImmediate =
       AMDGPU::encodeWaitcnt(IV,
                             VMCnt ? 0 : getVmcntBitMask(IV),
-                            EXPCnt ? 0 : getExpcntBitMask(IV),
+                            getExpcntBitMask(IV),
                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
     Changed = true;
Index: llvm/trunk/test/CodeGen/AMDGPU/atomicrmw-nand.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -12,8 +12,10 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v1, v2
 ; GCN-NEXT:    v_or_b32_e32 v1, -5, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_wbinvl1_vol
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    v_mov_b32_e32 v2, v1
 ; GCN-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -27,9 +27,9 @@
   %tmp1 = zext i32 %tmp to i64
   %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
   %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
-  fence syncscope("workgroup") release
+  fence syncscope("workgroup-one-as") release
   tail call void @llvm.amdgcn.s.barrier()
-  fence syncscope("workgroup") acquire
+  fence syncscope("workgroup-one-as") acquire
   %tmp4 = add nsw i32 %tmp3, %tmp3
   %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
   %tmp6 = add nsw i32 %tmp5, %tmp4
Index: llvm/trunk/test/CodeGen/AMDGPU/local-atomics-fp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -34,7 +34,7 @@
 ; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
 ; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt lgkmcnt(1)
+; HAS-ATOMICS: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
   %idx.add = add nuw i32 %idx, 4
@@ -49,6 +49,27 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}lds_ds_fadd_one_as:
+; VI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; HAS-ATOMICS: s_waitcnt lgkmcnt(1)
+; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
+  %a1 = atomicrmw fadd float addrspace(3)* %ptr0, float 4.2e+1 syncscope("one-as") seq_cst
+  %a2 = atomicrmw fadd float addrspace(3)* %ptr1, float 4.2e+1 syncscope("one-as") seq_cst
+  %a3 = atomicrmw fadd float addrspace(3)* %ptrf, float %a1 syncscope("one-as") seq_cst
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64:
 ; GCN: ds_read_b64
 ; GCN: v_add_f64
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}system_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_acquire() {
@@ -18,7 +18,7 @@
 ; FUNC-LABEL: {{^}}system_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_release() {
 entry:
@@ -29,7 +29,7 @@
 ; FUNC-LABEL: {{^}}system_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_acq_rel() {
@@ -41,7 +41,7 @@
 ; FUNC-LABEL: {{^}}system_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_seq_cst() {
@@ -50,6 +50,53 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}system_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acquire() {
+entry:
+  fence syncscope("one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_release() {
+entry:
+  fence syncscope("one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acq_rel() {
+entry:
+  fence syncscope("one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_seq_cst() {
+entry:
+  fence syncscope("one-as") seq_cst
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}singlethread_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
@@ -90,10 +137,50 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acquire() {
+entry:
+  fence syncscope("singlethread-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_release() {
+entry:
+  fence syncscope("singlethread-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acq_rel() {
+entry:
+  fence syncscope("singlethread-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_seq_cst() {
+entry:
+  fence syncscope("singlethread-one-as") seq_cst
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}agent_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_acquire() {
@@ -105,7 +192,7 @@
 ; FUNC-LABEL: {{^}}agent_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_release() {
 entry:
@@ -116,7 +203,7 @@
 ; FUNC-LABEL: {{^}}agent_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_acq_rel() {
@@ -128,7 +215,7 @@
 ; FUNC-LABEL: {{^}}agent_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_seq_cst() {
@@ -137,9 +224,56 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}agent_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acquire() {
+entry:
+  fence syncscope("agent-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_release() {
+entry:
+  fence syncscope("agent-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acq_rel() {
+entry:
+  fence syncscope("agent-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_seq_cst() {
+entry:
+  fence syncscope("agent-one-as") seq_cst
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}workgroup_acquire:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acquire() {
@@ -150,7 +284,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_release:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_release() {
@@ -161,7 +295,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_acq_rel:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acq_rel() {
@@ -172,7 +306,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_seq_cst:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_seq_cst() {
@@ -181,6 +315,50 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acquire() {
+entry:
+  fence syncscope("workgroup-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_release() {
+entry:
+  fence syncscope("workgroup-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acq_rel() {
+entry:
+  fence syncscope("workgroup-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_seq_cst() {
+entry:
+  fence syncscope("workgroup-one-as") seq_cst
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}wavefront_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
@@ -220,3 +398,43 @@
   fence syncscope("wavefront") seq_cst
   ret void
 }
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acquire() {
+entry:
+  fence syncscope("wavefront-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_release() {
+entry:
+  fence syncscope("wavefront-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acq_rel() {
+entry:
+  fence syncscope("wavefront-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_seq_cst() {
+entry:
+  fence syncscope("wavefront-one-as") seq_cst
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll
@@ -2,9 +2,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 
 ; GCN-LABEL: {{^}}system_monotonic_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @system_monotonic_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -15,9 +15,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acquire_monotonic:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acquire_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -28,9 +28,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_release_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @system_release_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -41,9 +41,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acq_rel_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acq_rel_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -54,9 +54,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_seq_cst_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_seq_cst_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -67,9 +67,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acquire_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acquire_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -80,9 +80,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_release_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_release_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -93,9 +93,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acq_rel_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acq_rel_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -106,9 +106,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_seq_cst_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_seq_cst_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -119,9 +119,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_seq_cst_seq_cst:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_seq_cst_seq_cst(
     i32* %out, i32 %in, i32 %old) {
@@ -131,11 +131,141 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_monotonic_monotonic:
+; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_monotonic_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acquire_monotonic:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acquire_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_release_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_release_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acq_rel_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_seq_cst_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acquire_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acquire_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_release_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_release_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acq_rel_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_seq_cst_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_seq_cst_seq_cst(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_monotonic_monotonic(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -145,9 +275,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acquire_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acquire_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -158,9 +288,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_release_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_release_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -171,9 +301,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acq_rel_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -184,9 +314,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_seq_cst_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -197,9 +327,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acquire_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acquire_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -210,9 +340,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_release_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_release_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -223,9 +353,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acq_rel_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acq_rel_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -236,9 +366,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_seq_cst_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_seq_cst_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -249,9 +379,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_seq_cst_seq_cst(
     i32* %out, i32 %in, i32 %old) {
@@ -261,11 +391,141 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_monotonic_monotonic:
+; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_monotonic_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acquire_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_release_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acquire_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_release_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_release_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_monotonic_monotonic(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -275,9 +535,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acquire_monotonic:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acquire_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -288,9 +548,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_release_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_release_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -301,9 +561,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acq_rel_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acq_rel_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -314,9 +574,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_seq_cst_monotonic:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_seq_cst_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -327,9 +587,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acquire_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acquire_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -340,9 +600,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_release_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_release_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -353,9 +613,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acq_rel_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acq_rel_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -366,9 +626,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_seq_cst_acquire:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_seq_cst_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -379,9 +639,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_seq_cst_seq_cst:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_seq_cst_seq_cst(
     i32* %out, i32 %in, i32 %old) {
@@ -391,11 +651,141 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_monotonic_monotonic:
+; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_monotonic_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acquire_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_release_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_release_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acq_rel_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_seq_cst_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acquire_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acquire_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_release_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_release_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acq_rel_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_seq_cst_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_monotonic_monotonic(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -405,9 +795,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acquire_monotonic:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acquire_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -418,9 +808,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_release_monotonic:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_release_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -431,9 +821,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acq_rel_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -444,9 +834,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_seq_cst_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -457,9 +847,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acquire_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acquire_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -470,9 +860,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_release_acquire:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_release_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -483,9 +873,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acq_rel_acquire:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acq_rel_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -496,9 +886,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_seq_cst_acquire:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_seq_cst_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -509,9 +899,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_seq_cst_seq_cst(
     i32* %out, i32 %in, i32 %old) {
@@ -521,11 +911,141 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_monotonic_monotonic:
+; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acquire_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_release_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acquire_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_release_acquire:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_release_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_monotonic_monotonic(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -535,9 +1055,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acquire_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acquire_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -548,9 +1068,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_release_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_release_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -561,9 +1081,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acq_rel_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -574,9 +1094,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_seq_cst_monotonic(
     i32* %out, i32 %in, i32 %old) {
@@ -587,9 +1107,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acquire_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acquire_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -600,9 +1120,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_release_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_release_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -613,9 +1133,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acq_rel_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acq_rel_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -626,9 +1146,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_seq_cst_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_seq_cst_acquire(
     i32* %out, i32 %in, i32 %old) {
@@ -639,9 +1159,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_seq_cst_seq_cst(
     i32* %out, i32 %in, i32 %old) {
@@ -650,3 +1170,133 @@
   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
   ret void
 }
+
+; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acquire_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_release_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acquire_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_release_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_release_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst(
+    i32* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s
 
-; FUNC-LABEL: {{^}}system_acquire:
+; FUNC-LABEL: {{^}}system_one_as_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GFX6:       s_waitcnt vmcnt(0){{$}}
@@ -10,6 +10,232 @@
 ; GFX8:       s_waitcnt vmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acquire() {
+entry:
+  fence syncscope("one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_release() {
+entry:
+  fence syncscope("one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX6:       buffer_wbinvl1{{$}}
+; GFX8:       buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acq_rel() {
+entry:
+  fence syncscope("one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX6:       buffer_wbinvl1{{$}}
+; GFX8:       buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_seq_cst() {
+entry:
+  fence syncscope("one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acquire() {
+entry:
+  fence syncscope("singlethread-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_release() {
+entry:
+  fence syncscope("singlethread-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acq_rel() {
+entry:
+  fence syncscope("singlethread-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_seq_cst() {
+entry:
+  fence syncscope("singlethread-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GFX6:       s_waitcnt vmcnt(0){{$}}
+; GFX6-NEXT:  buffer_wbinvl1{{$}}
+; GFX8:       s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acquire() {
+entry:
+  fence syncscope("agent-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_release() {
+entry:
+  fence syncscope("agent-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX6:       buffer_wbinvl1{{$}}
+; GFX8:       buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acq_rel() {
+entry:
+  fence syncscope("agent-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GFX6:       buffer_wbinvl1{{$}}
+; GFX8:       buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_seq_cst() {
+entry:
+  fence syncscope("agent-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
+; GCN:        %bb.0
+; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acquire() {
+entry:
+  fence syncscope("workgroup-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_release:
+; GCN:        %bb.0
+; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_release() {
+entry:
+  fence syncscope("workgroup-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
+; GCN:        %bb.0
+; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acq_rel() {
+entry:
+  fence syncscope("workgroup-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
+; GCN:        %bb.0
+; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_seq_cst() {
+entry:
+  fence syncscope("workgroup-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acquire() {
+entry:
+  fence syncscope("wavefront-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_release() {
+entry:
+  fence syncscope("wavefront-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acq_rel() {
+entry:
+  fence syncscope("wavefront-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_seq_cst() {
+entry:
+  fence syncscope("wavefront-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX6-NEXT:  buffer_wbinvl1{{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
+; GCN:        s_endpgm
 define amdgpu_kernel void @system_acquire() {
 entry:
   fence acquire
@@ -19,7 +245,7 @@
 ; FUNC-LABEL: {{^}}system_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_release() {
 entry:
@@ -30,7 +256,7 @@
 ; FUNC-LABEL: {{^}}system_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
@@ -43,7 +269,7 @@
 ; FUNC-LABEL: {{^}}system_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
@@ -96,9 +322,9 @@
 ; FUNC-LABEL: {{^}}agent_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GFX6:       s_waitcnt vmcnt(0){{$}}
+; GFX6:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX6-NEXT:  buffer_wbinvl1{{$}}
-; GFX8:       s_waitcnt vmcnt(0){{$}}
+; GFX8:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_acquire() {
@@ -110,7 +336,7 @@
 ; FUNC-LABEL: {{^}}agent_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_release() {
 entry:
@@ -121,7 +347,7 @@
 ; FUNC-LABEL: {{^}}agent_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
@@ -134,7 +360,7 @@
 ; FUNC-LABEL: {{^}}agent_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX6:       buffer_wbinvl1{{$}}
 ; GFX8:       buffer_wbinvl1_vol{{$}}
 ; GCN:        s_endpgm
@@ -146,7 +372,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_acquire:
 ; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acquire() {
@@ -157,7 +383,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_release:
 ; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_release() {
@@ -168,7 +394,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_acq_rel:
 ; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acq_rel() {
@@ -179,7 +405,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_seq_cst:
 ; GCN:        %bb.0
-; GFX68-NOT:  s_waitcnt vmcnt(0){{$}}
+; GFX68-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_seq_cst() {
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -104,7 +104,7 @@
     S_WAITCNT 127
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
     S_WAITCNT 3952
-    BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load seq_cst 4 from %ir.gep)
+    BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from %ir.gep)
 
   bb.2.exit:
     liveins: $sgpr2_sgpr3
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll
@@ -1,11 +1,311 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 
-; GCN-LABEL: {{^}}system_monotonic:
+; GCN-LABEL: {{^}}system_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_monotonic(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acquire(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_release:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_release(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_acq_rel:
+; GCN:         s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:    flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:    s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_acq_rel(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_one_as_seq_cst:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @system_one_as_seq_cst(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_monotonic(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acquire(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_release:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_release(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_acq_rel:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_acq_rel(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @singlethread_one_as_seq_cst(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_monotonic(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acquire(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_release:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_release(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_acq_rel:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_acq_rel(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_one_as_seq_cst:
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NEXT:  buffer_wbinvl1_vol
+define amdgpu_kernel void @agent_one_as_seq_cst(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_monotonic(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acquire(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_release:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_release(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_acq_rel:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_acq_rel(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
+; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @workgroup_one_as_seq_cst(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_monotonic(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acquire(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_release:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_release(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_acq_rel:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_acq_rel(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
+define amdgpu_kernel void @wavefront_one_as_seq_cst(
+    i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @system_monotonic(
     i32* %out, i32 %in) {
 entry:
@@ -14,9 +314,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acquire(
     i32* %out, i32 %in) {
@@ -26,9 +326,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_release:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @system_release(
     i32* %out, i32 %in) {
@@ -38,9 +338,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_acq_rel:
-; GCN:         s_waitcnt vmcnt(0){{$}}
+; GCN:         s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:    flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @system_acq_rel(
     i32* %out, i32 %in) {
@@ -50,9 +350,9 @@
 }
 
 ; GCN-LABEL: {{^}}system_seq_cst:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @system_seq_cst(
     i32* %out, i32 %in) {
@@ -62,9 +362,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_monotonic(
     i32* %out, i32 %in) {
@@ -74,9 +374,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acquire(
     i32* %out, i32 %in) {
@@ -86,9 +386,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_release:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_release(
     i32* %out, i32 %in) {
@@ -98,9 +398,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_acq_rel:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_acq_rel(
     i32* %out, i32 %in) {
@@ -110,9 +410,9 @@
 }
 
 ; GCN-LABEL: {{^}}singlethread_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @singlethread_seq_cst(
     i32* %out, i32 %in) {
@@ -122,9 +422,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_monotonic(
     i32* %out, i32 %in) {
@@ -134,9 +434,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acquire(
     i32* %out, i32 %in) {
@@ -146,9 +446,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_release:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_release(
     i32* %out, i32 %in) {
@@ -158,9 +458,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_acq_rel:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_acq_rel(
     i32* %out, i32 %in) {
@@ -170,9 +470,9 @@
 }
 
 ; GCN-LABEL: {{^}}agent_seq_cst:
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NEXT:  buffer_wbinvl1_vol
 define amdgpu_kernel void @agent_seq_cst(
     i32* %out, i32 %in) {
@@ -182,9 +482,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_monotonic(
     i32* %out, i32 %in) {
@@ -194,9 +494,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acquire:
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acquire(
     i32* %out, i32 %in) {
@@ -206,9 +506,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_release:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_release(
     i32* %out, i32 %in) {
@@ -218,9 +518,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_acq_rel:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_acq_rel(
     i32* %out, i32 %in) {
@@ -230,9 +530,9 @@
 }
 
 ; GCN-LABEL: {{^}}workgroup_seq_cst:
-; GFX8-NOT:   s_waitcnt vmcnt(0){{$}}
+; GFX8-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GFX8-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @workgroup_seq_cst(
     i32* %out, i32 %in) {
@@ -242,9 +542,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_monotonic:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_monotonic(
     i32* %out, i32 %in) {
@@ -254,9 +554,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acquire:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acquire(
     i32* %out, i32 %in) {
@@ -266,9 +566,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_release:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_release(
     i32* %out, i32 %in) {
@@ -278,9 +578,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_acq_rel:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_acq_rel(
     i32* %out, i32 %in) {
@@ -290,9 +590,9 @@
 }
 
 ; GCN-LABEL: {{^}}wavefront_seq_cst:
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:       flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:   buffer_wbinvl1_vol
 define amdgpu_kernel void @wavefront_seq_cst(
     i32* %out, i32 %in) {
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
@@ -11,7 +11,7 @@
 
     $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
     $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
-    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load seq_cst 4 from `i32 addrspace(42)* undef`)
+    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -30,7 +30,7 @@
     $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(42)* undef`)
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...
@@ -47,7 +47,7 @@
     $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
     $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
-    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`)
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...
@@ -63,7 +63,7 @@
     $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
     $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
     $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront") seq_cst 4 on `i32 addrspace(42)* undef`)
+    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`)
     S_ENDPGM 0
 
 ...
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -5,282 +5,282 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-; GCN-LABEL: {{^}}system_unordered:
+; GCN-LABEL: {{^}}system_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @system_unordered(
+define amdgpu_kernel void @system_one_as_unordered(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in unordered, align 4
+  %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}system_monotonic:
+; GCN-LABEL: {{^}}system_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @system_monotonic(
+define amdgpu_kernel void @system_one_as_monotonic(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in monotonic, align 4
+  %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}system_acquire:
+; GCN-LABEL: {{^}}system_one_as_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 ; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @system_acquire(
+define amdgpu_kernel void @system_one_as_acquire(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in acquire, align 4
+  %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}system_seq_cst:
+; GCN-LABEL: {{^}}system_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @system_seq_cst(
+define amdgpu_kernel void @system_one_as_seq_cst(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in seq_cst, align 4
+  %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_unordered:
+; GCN-LABEL: {{^}}singlethread_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @singlethread_unordered(
+define amdgpu_kernel void @singlethread_one_as_unordered(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
+  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_monotonic:
+; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @singlethread_monotonic(
+define amdgpu_kernel void @singlethread_one_as_monotonic(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
+  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_acquire:
+; GCN-LABEL: {{^}}singlethread_one_as_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @singlethread_acquire(
+define amdgpu_kernel void @singlethread_one_as_acquire(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
+  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_seq_cst:
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @singlethread_seq_cst(
+define amdgpu_kernel void @singlethread_one_as_seq_cst(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
+  %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_unordered:
+; GCN-LABEL: {{^}}agent_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @agent_unordered(
+define amdgpu_kernel void @agent_one_as_unordered(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
+  %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_monotonic:
+; GCN-LABEL: {{^}}agent_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @agent_monotonic(
+define amdgpu_kernel void @agent_one_as_monotonic(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
+  %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_acquire:
+; GCN-LABEL: {{^}}agent_one_as_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 ; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @agent_acquire(
+define amdgpu_kernel void @agent_one_as_acquire(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
+  %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_seq_cst:
+; GCN-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NEXT: buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @agent_seq_cst(
+define amdgpu_kernel void @agent_one_as_seq_cst(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
+  %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_unordered:
+; GCN-LABEL: {{^}}workgroup_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @workgroup_unordered(
+define amdgpu_kernel void @workgroup_one_as_unordered(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
+  %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_monotonic:
+; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @workgroup_monotonic(
+define amdgpu_kernel void @workgroup_one_as_monotonic(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
+  %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_acquire:
+; GCN-LABEL: {{^}}workgroup_one_as_acquire:
 ; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
 ; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT:  buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @workgroup_acquire(
+define amdgpu_kernel void @workgroup_one_as_acquire(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
+  %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_seq_cst:
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT:  buffer_wbinvl1_vol
 ; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @workgroup_seq_cst(
+define amdgpu_kernel void @workgroup_one_as_seq_cst(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
+  %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_unordered:
+; GCN-LABEL: {{^}}wavefront_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @wavefront_unordered(
+define amdgpu_kernel void @wavefront_one_as_unordered(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
+  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_monotonic:
+; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @wavefront_monotonic(
+define amdgpu_kernel void @wavefront_one_as_monotonic(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
+  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_acquire:
+; GCN-LABEL: {{^}}wavefront_one_as_acquire:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @wavefront_acquire(
+define amdgpu_kernel void @wavefront_one_as_acquire(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
+  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
   store i32 %val, i32* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_seq_cst:
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define amdgpu_kernel void @wavefront_seq_cst(
+define amdgpu_kernel void @wavefront_one_as_seq_cst(
     i32* %in, i32* %out) {
 entry:
-  %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
+  %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
   store i32 %val, i32* %out
   ret void
 }
@@ -374,4 +374,284 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}system_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @system_unordered(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in unordered, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @system_monotonic(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in monotonic, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NEXT: buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @system_acquire(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in acquire, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_seq_cst:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NEXT: buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @system_seq_cst(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in seq_cst, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @singlethread_unordered(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @singlethread_monotonic(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @singlethread_acquire(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @singlethread_seq_cst(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @agent_unordered(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @agent_monotonic(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NEXT: buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @agent_acquire(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_seq_cst:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN-NEXT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NEXT: buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @agent_seq_cst(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @workgroup_unordered(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @workgroup_monotonic(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_acquire:
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT:  buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @workgroup_acquire(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_seq_cst:
+; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX89:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT:  buffer_wbinvl1_vol
+; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @workgroup_seq_cst(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @wavefront_unordered(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @wavefront_monotonic(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_acquire:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @wavefront_acquire(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX89-NOT: buffer_wbinvl1_vol
+; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @wavefront_seq_cst(
+    i32* %in, i32* %out) {
+entry:
+  %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
+  store i32 %val, i32* %out
+  ret void
+}
+
 !0 = !{i32 1}
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir
@@ -17,7 +17,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -41,7 +41,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -65,7 +65,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -89,7 +89,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -113,7 +113,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -137,7 +137,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -161,7 +161,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -185,7 +185,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -209,7 +209,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -233,7 +233,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -257,7 +257,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -281,7 +281,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -305,7 +305,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -329,7 +329,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -353,7 +353,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -377,7 +377,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -401,7 +401,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -425,7 +425,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -449,7 +449,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -473,7 +473,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(3)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -498,7 +498,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -520,7 +520,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -542,7 +542,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -564,7 +564,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -586,7 +586,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -608,7 +608,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -630,7 +630,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -652,7 +652,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -674,7 +674,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -696,7 +696,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -718,7 +718,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -740,7 +740,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -762,7 +762,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -784,7 +784,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -806,7 +806,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -828,7 +828,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -850,7 +850,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -872,7 +872,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -894,7 +894,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -916,7 +916,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(3)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -938,7 +938,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -960,7 +960,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -982,7 +982,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1004,7 +1004,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1026,7 +1026,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1048,7 +1048,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`)
     S_ENDPGM 0
 
 ...
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll
@@ -3,10 +3,228 @@
 ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
 ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s
 
-; FUNC-LABEL: {{^}}system_acquire:
+; FUNC-LABEL: {{^}}system_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acquire() {
+entry:
+  fence syncscope("one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_release() {
+entry:
+  fence syncscope("one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_acq_rel() {
+entry:
+  fence syncscope("one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @system_one_as_seq_cst() {
+entry:
+  fence syncscope("one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acquire() {
+entry:
+  fence syncscope("singlethread-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_release() {
+entry:
+  fence syncscope("singlethread-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_acq_rel() {
+entry:
+  fence syncscope("singlethread-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @singlethread_one_as_seq_cst() {
+entry:
+  fence syncscope("singlethread-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN-NEXT:   buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acquire() {
+entry:
+  fence syncscope("agent-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_release() {
+entry:
+  fence syncscope("agent-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_acq_rel() {
+entry:
+  fence syncscope("agent-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        buffer_wbinvl1{{$}}
+; GCN:        s_endpgm
+define amdgpu_kernel void @agent_one_as_seq_cst() {
+entry:
+  fence syncscope("agent-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acquire() {
+entry:
+  fence syncscope("workgroup-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_release() {
+entry:
+  fence syncscope("workgroup-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_acq_rel() {
+entry:
+  fence syncscope("workgroup-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @workgroup_one_as_seq_cst() {
+entry:
+  fence syncscope("workgroup-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acquire() {
+entry:
+  fence syncscope("wavefront-one-as") acquire
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_release:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_release() {
+entry:
+  fence syncscope("wavefront-one-as") release
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_acq_rel() {
+entry:
+  fence syncscope("wavefront-one-as") acq_rel
+  ret void
+}
+
+; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_endpgm
+define amdgpu_kernel void @wavefront_one_as_seq_cst() {
+entry:
+  fence syncscope("wavefront-one-as") seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}system_acquire:
+; GCN:        %bb.0
+; GCN-NOT:    ATOMIC_FENCE
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_acquire() {
@@ -18,7 +236,7 @@
 ; FUNC-LABEL: {{^}}system_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_release() {
 entry:
@@ -29,7 +247,7 @@
 ; FUNC-LABEL: {{^}}system_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_acq_rel() {
@@ -41,7 +259,7 @@
 ; FUNC-LABEL: {{^}}system_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @system_seq_cst() {
@@ -93,7 +311,7 @@
 ; FUNC-LABEL: {{^}}agent_acquire:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT:   buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_acquire() {
@@ -105,7 +323,7 @@
 ; FUNC-LABEL: {{^}}agent_release:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_release() {
 entry:
@@ -116,7 +334,7 @@
 ; FUNC-LABEL: {{^}}agent_acq_rel:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_acq_rel() {
@@ -128,7 +346,7 @@
 ; FUNC-LABEL: {{^}}agent_seq_cst:
 ; GCN:        %bb.0
 ; GCN-NOT:    ATOMIC_FENCE
-; GCN:        s_waitcnt vmcnt(0){{$}}
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN:        buffer_wbinvl1{{$}}
 ; GCN:        s_endpgm
 define amdgpu_kernel void @agent_seq_cst() {
@@ -139,7 +357,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_acquire:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acquire() {
@@ -150,7 +368,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_release:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_release() {
@@ -161,7 +379,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_acq_rel:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_acq_rel() {
@@ -172,7 +390,7 @@
 
 ; FUNC-LABEL: {{^}}workgroup_seq_cst:
 ; GCN:        %bb.0
-; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
+; GCN-NOT:    s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NOT:    ATOMIC_FENCE
 ; GCN:        s_endpgm
 define amdgpu_kernel void @workgroup_seq_cst() {
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -55,7 +55,7 @@
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir
@@ -17,7 +17,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -41,7 +41,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -65,7 +65,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -89,7 +89,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -113,7 +113,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -137,7 +137,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -161,7 +161,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -185,7 +185,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -209,7 +209,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -233,7 +233,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -257,7 +257,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -281,7 +281,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -305,7 +305,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -329,7 +329,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -353,7 +353,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -377,7 +377,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -401,7 +401,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -425,7 +425,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -449,7 +449,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -473,7 +473,7 @@
     $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
     $m0 = S_MOV_B32 -1
     $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
-    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(2)* undef`)
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`)
     $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
     FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
@@ -498,7 +498,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -520,7 +520,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -542,7 +542,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -564,7 +564,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -586,7 +586,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -608,7 +608,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -630,7 +630,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -652,7 +652,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -674,7 +674,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -696,7 +696,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -718,7 +718,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -740,7 +740,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -762,7 +762,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 ...
 ---
@@ -783,7 +783,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -805,7 +805,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -827,7 +827,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -893,7 +893,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -915,7 +915,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(2)* undef`)
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -937,7 +937,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -959,7 +959,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -981,7 +981,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1003,7 +1003,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1025,7 +1025,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
@@ -1047,7 +1047,7 @@
     $m0 = S_MOV_B32 -1
     $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
     $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
-    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`)
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`)
     S_ENDPGM 0
 
 ...
Index: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -5,203 +5,203 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-; GCN-LABEL: {{^}}system_unordered:
+; GCN-LABEL: {{^}}system_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @system_unordered(
+define amdgpu_kernel void @system_one_as_unordered(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out unordered, align 4
+  store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}system_monotonic:
+; GCN-LABEL: {{^}}system_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @system_monotonic(
+define amdgpu_kernel void @system_one_as_monotonic(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out monotonic, align 4
+  store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}system_release:
+; GCN-LABEL: {{^}}system_one_as_release:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @system_release(
+define amdgpu_kernel void @system_one_as_release(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out release, align 4
+  store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}system_seq_cst:
+; GCN-LABEL: {{^}}system_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @system_seq_cst(
+define amdgpu_kernel void @system_one_as_seq_cst(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out seq_cst, align 4
+  store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_unordered:
+; GCN-LABEL: {{^}}singlethread_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @singlethread_unordered(
+define amdgpu_kernel void @singlethread_one_as_unordered(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
+  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_monotonic:
+; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @singlethread_monotonic(
+define amdgpu_kernel void @singlethread_one_as_monotonic(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
+  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_release:
+; GCN-LABEL: {{^}}singlethread_one_as_release:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @singlethread_release(
+define amdgpu_kernel void @singlethread_one_as_release(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
+  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}singlethread_seq_cst:
+; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @singlethread_seq_cst(
+define amdgpu_kernel void @singlethread_one_as_seq_cst(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
+  store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_unordered:
+; GCN-LABEL: {{^}}agent_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @agent_unordered(
+define amdgpu_kernel void @agent_one_as_unordered(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
+  store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_monotonic:
+; GCN-LABEL: {{^}}agent_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @agent_monotonic(
+define amdgpu_kernel void @agent_one_as_monotonic(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
+  store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_release:
+; GCN-LABEL: {{^}}agent_one_as_release:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @agent_release(
+define amdgpu_kernel void @agent_one_as_release(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("agent") release, align 4
+  store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}agent_seq_cst:
+; GCN-LABEL: {{^}}agent_one_as_seq_cst:
 ; GCN:        s_waitcnt vmcnt(0){{$}}
 ; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @agent_seq_cst(
+define amdgpu_kernel void @agent_one_as_seq_cst(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
+  store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_unordered:
+; GCN-LABEL: {{^}}workgroup_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @workgroup_unordered(
+define amdgpu_kernel void @workgroup_one_as_unordered(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
+  store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_monotonic:
+; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @workgroup_monotonic(
+define amdgpu_kernel void @workgroup_one_as_monotonic(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
+  store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_release:
+; GCN-LABEL: {{^}}workgroup_one_as_release:
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @workgroup_release(
+define amdgpu_kernel void @workgroup_one_as_release(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
+  store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}workgroup_seq_cst:
+; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
 ; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
 ; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @workgroup_seq_cst(
+define amdgpu_kernel void @workgroup_one_as_seq_cst(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
+  store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_unordered:
+; GCN-LABEL: {{^}}wavefront_one_as_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @wavefront_unordered(
+define amdgpu_kernel void @wavefront_one_as_unordered(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
+  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_monotonic:
+; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @wavefront_monotonic(
+define amdgpu_kernel void @wavefront_one_as_monotonic(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
+  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_release:
+; GCN-LABEL: {{^}}wavefront_one_as_release:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @wavefront_release(
+define amdgpu_kernel void @wavefront_one_as_release(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
+  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}wavefront_seq_cst:
+; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
-define amdgpu_kernel void @wavefront_seq_cst(
+define amdgpu_kernel void @wavefront_one_as_seq_cst(
     i32 %in, i32* %out) {
 entry:
-  store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
+  store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
   ret void
 }
 
@@ -295,4 +295,204 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}system_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @system_unordered(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out unordered, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @system_monotonic(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out monotonic, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_release:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @system_release(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out release, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}system_seq_cst:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @system_seq_cst(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out seq_cst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @singlethread_unordered(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @singlethread_monotonic(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_release:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @singlethread_release(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}singlethread_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @singlethread_seq_cst(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @agent_unordered(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @agent_monotonic(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_release:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @agent_release(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("agent") release, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}agent_seq_cst:
+; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN-NEXT:   flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @agent_seq_cst(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @workgroup_unordered(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @workgroup_monotonic(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_release:
+; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @workgroup_release(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}workgroup_seq_cst:
+; GFX89-NOT:  s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:        flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @workgroup_seq_cst(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_unordered:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @wavefront_unordered(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_monotonic:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @wavefront_monotonic(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_release:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @wavefront_release(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}wavefront_seq_cst:
+; GCN-NOT:   s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN:       flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @wavefront_seq_cst(
+    i32 %in, i32* %out) {
+entry:
+  store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
+  ret void
+}
+
 !0 = !{i32 1}